From 6e1012272b3483a18d56c2af43b1172628b9711d Mon Sep 17 00:00:00 2001
From: shihaobai <42648726+shihaobai@users.noreply.github.com>
Date: Mon, 8 Dec 2025 20:52:03 +0800
Subject: [PATCH 01/65] Rl weight (#1143)

Co-authored-by: sufubao <sufubao@sensetime.com>
---
 lightllm/common/basemodel/basemodel.py        |   9 -
 .../layer_weights/meta_weights/__init__.py    |   5 +-
 .../{ => fused_moe}/fused_moe_weight_ep.py    | 182 +++++++--
 .../fused_moe_weight_ep_redundancy.py         |  12 +-
 .../fused_moe/fused_moe_weight_tp.py          | 325 ++++++++++++++++
 .../gpt_oss_fused_moe_weight_tp.py            |   2 +-
 .../meta_weights/mm_weight/__init__.py        |   9 +-
 .../meta_weights/mm_weight/colmm_weight.py    |  82 +----
 .../meta_weights/mm_weight/mm_factory.py      |  90 -----
 .../meta_weights/mm_weight/mm_slicer.py       |  18 +
 .../meta_weights/mm_weight/mm_weight.py       | 348 +++---------------
 .../meta_weights/mm_weight/rowmm_weight.py    |  88 +----
 .../layer_weights/meta_weights/norm_weight.py | 152 +++-----
 .../layer_weights/transformer_layer_weight.py |   4 +
 lightllm/common/quantization/__init__.py      |   5 +-
 lightllm/common/quantization/awq_quant.py     | 139 ++++---
 .../common/quantization/deepgemm_quant.py     |  55 ++-
 lightllm/common/quantization/no_quant.py      |  52 +++
 .../common/quantization/quantize_method.py    |  66 +++-
 lightllm/common/quantization/registry.py      |   5 +-
 lightllm/common/quantization/torchao_quant.py |   9 +-
 .../quantization/triton_quant/triton_quant.py |  43 ++-
 lightllm/common/quantization/w8a8_quant.py    | 100 +++--
 .../layer_weights/transformer_layer_weight.py |   4 +-
 .../layer_weights/transformer_layer_weight.py |  49 ++-
 .../layer_weights/transformer_layer_weight.py |   1 +
 .../layer_weights/transformer_layer_weight.py |   9 -
 .../layer_weights/transformer_layer_weight.py |   5 +-
 .../pre_and_post_layer_weight.py              |   1 +
 .../pre_and_post_layer_weight.py              |   1 +
 .../pre_and_post_layer_weight.py              |  54 ++-
 .../mode_backend/redundancy_expert_manager.py |   4 +-
 32 files changed, 1044 insertions(+), 884 deletions(-)
 rename lightllm/common/basemodel/layer_weights/meta_weights/{ => fused_moe}/fused_moe_weight_ep.py (74%)
 rename lightllm/common/basemodel/layer_weights/meta_weights/{ => fused_moe}/fused_moe_weight_ep_redundancy.py (96%)
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
 rename lightllm/common/basemodel/layer_weights/meta_weights/{ => fused_moe}/gpt_oss_fused_moe_weight_tp.py (99%)
 delete mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_factory.py
 create mode 100644 lightllm/common/quantization/no_quant.py

diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
index 26d51af3d..25171df2a 100755
--- a/lightllm/common/basemodel/basemodel.py
+++ b/lightllm/common/basemodel/basemodel.py
@@ -179,15 +179,6 @@ def _init_weights(self, start_layer_index=0):
             )
             for i in range(start_layer_index, start_layer_index + self.config["n_layer"])
         ]
-        load_hf_weights(
-            self.data_type,
-            weight_dir=self.weight_dir_,
-            pre_post_layer=self.pre_post_weight,
-            transformer_layer_list=self.trans_layers_weight,
-            weight_dict=self.weight_dict,
-        )
-        self.pre_post_weight.verify_load()
-        [weight.verify_load() for weight in self.trans_layers_weight]
         return
 
     def _init_mem_manager(self):
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index 0fa02780c..72e0034cb 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -1,13 +1,12 @@
 from .base_weight import BaseWeight
 from .mm_weight import (
-    MMWeightPack,
     MMWeightTpl,
     ROWMMWeight,
     COLMMWeight,
     ROWBMMWeight,
 )
 from .norm_weight import NoTpGEMMANormWeight, TpVitPadNormWeight, NoTpNormWeight, TpHeadNormWeight
-from .fused_moe_weight_tp import create_tp_moe_wegiht_obj
-from .fused_moe_weight_ep import FusedMoeWeightEP
 from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
 from .att_sink_weight import TpAttSinkWeight
+from .fused_moe.fused_moe_weight_tp import create_tp_moe_wegiht_obj
+from .fused_moe.fused_moe_weight_ep import FusedMoeWeightEP
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
similarity index 74%
rename from lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py
rename to lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
index 7dc5b5fdc..0923d5dea 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
@@ -3,7 +3,7 @@
 import threading
 from typing import Optional, Tuple, List, Dict, Any
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_device_id
-from .base_weight import BaseWeight
+from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeight
 from lightllm.common.fused_moe.grouped_fused_moe_ep import (
     fused_experts_impl,
     masked_group_gemm,
@@ -23,6 +23,7 @@
 from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
 from lightllm.utils.log_utils import init_logger
 from lightllm.common.triton_utils.autotuner import Autotuner
+from lightllm.common.quantization.quantize_method import WeightPack
 
 
 logger = init_logger(__name__)
@@ -41,6 +42,7 @@ def __init__(
         network_config: Dict[str, Any],
         layer_num: int,
         quant_cfg=None,
+        hidden_size: Optional[int] = None,
     ) -> None:
         super().__init__()
 
@@ -62,6 +64,7 @@ def __init__(
         self.e_score_correction_bias_name = e_score_correction_bias_name
         self.n_routed_experts = n_routed_experts
         self.data_type_ = data_type
+        self.hidden_size = hidden_size
 
         global_world_size = get_global_world_size()
         self.global_rank_ = get_global_rank()
@@ -78,6 +81,7 @@ def __init__(
         assert self.n_routed_experts % global_world_size == 0
         self.ep_n_routed_experts = self.n_routed_experts // global_world_size
         ep_load_expert_num = self.ep_n_routed_experts + self.redundancy_expert_num
+        self.ep_load_expert_num = ep_load_expert_num
         self.experts_up_projs = [None] * ep_load_expert_num
         self.experts_gate_projs = [None] * ep_load_expert_num
         self.experts_up_proj_scales = [None] * ep_load_expert_num
@@ -105,6 +109,51 @@ def __init__(
         # auto update redundancy expert vars
         self.auto_update_redundancy_expert: bool = get_env_start_args().auto_update_redundancy_expert
 
+        # Pre-allocate memory if hidden_size is provided
+        if self.hidden_size is not None:
+            self._create_weight()
+
+    def _create_weight(self):
+        """Pre-allocate GPU memory for fused MoE weights"""
+        if self.hidden_size is None:
+            return
+
+        total_expert_num = self.ep_load_expert_num
+        # We need to determine intermediate size from network config or use a default
+        # This will be updated when first weight is loaded if needed
+        intermediate_size = getattr(self, "intermediate_size", None)
+        if intermediate_size is None:
+            # Default fallback - this will be corrected during load
+            intermediate_size = self.hidden_size * 4
+
+        device_id = get_current_device_id()
+
+        if not self.quantized_weight and self.quant_method is not None:
+            # Quantized weights
+            w1_pack = self.quant_method.create_weight(
+                total_expert_num * intermediate_size * 2, self.hidden_size, dtype=self.data_type_, device_id=device_id
+            )
+            self.w1[0] = w1_pack.weight.view(total_expert_num, intermediate_size * 2, self.hidden_size)
+            self.w1[1] = w1_pack.weight_scale.view(total_expert_num, intermediate_size * 2, self.hidden_size)
+
+            w2_pack = self.quant_method.create_weight(
+                total_expert_num * self.hidden_size, intermediate_size, dtype=self.data_type_, device_id=device_id
+            )
+            self.w2[0] = w2_pack.weight.view(total_expert_num, self.hidden_size, intermediate_size)
+            self.w2[1] = w2_pack.weight_scale.view(total_expert_num, self.hidden_size, intermediate_size)
+        else:
+            # Regular weights
+            self.w1[0] = torch.empty(
+                (total_expert_num, intermediate_size * 2, self.hidden_size),
+                dtype=self.data_type_,
+                device=f"cuda:{device_id}",
+            )
+            self.w2[0] = torch.empty(
+                (total_expert_num, self.hidden_size, intermediate_size),
+                dtype=self.data_type_,
+                device=f"cuda:{device_id}",
+            )
+
     def experts(
         self,
         input_tensor,
@@ -422,12 +471,12 @@ def _fuse(self):
                 inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
                 w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
                 if not self.quantized_weight and self.quant_method is not None:
-                    qw1, qw1_scale, qw1_zero_point = self.quant_method.quantize(w1)
-                    qw2, qw2_scale, qw2_zero_point = self.quant_method.quantize(w2)
-                    self.w1[0] = qw1
-                    self.w1[1] = qw1_scale
-                    self.w2[0] = qw2
-                    self.w2[1] = qw2_scale
+                    qw1_pack = self.quant_method.quantize(w1)
+                    qw2_pack = self.quant_method.quantize(w2)
+                    self.w1[0] = qw1_pack.weight
+                    self.w1[1] = qw1_pack.weight_scale
+                    self.w2[0] = qw2_pack.weight
+                    self.w2[1] = qw2_pack.weight_scale
                 else:
                     self.w1[0] = self._cuda(w1)
                     self.w2[0] = self._cuda(w2)
@@ -469,38 +518,74 @@ def _fuse_weight_scale(self):
 
     def load_hf_weights(self, weights):
         n_expert_ep = self.ep_n_routed_experts
-        # tp to ep here
+
+        # Load bias
         if self.e_score_correction_bias_name in weights:
             self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])
 
+        # Get weight shapes from first expert to determine intermediate size
+        first_expert_idx = 0 + n_expert_ep * self.global_rank_
+        w1_weight_name = f"{self.weight_prefix}.{first_expert_idx}.{self.w1_weight_name}.weight"
+        if w1_weight_name in weights:
+            intermediate_size = weights[w1_weight_name].shape[0]
+            self.intermediate_size = intermediate_size
+
+            # Re-create weights with correct size if needed
+            if self.w1[0].shape[1] != intermediate_size * 2:
+                self._create_weight()
+
+        # Load regular experts
         for i_experts_ep in range(n_expert_ep):
             i_experts = i_experts_ep + n_expert_ep * self.global_rank_
-            w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.weight"
-            w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.weight"
-            w3_weight = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.weight"
-            if w1_weight in weights:
-                self.experts_gate_projs[i_experts_ep] = weights[w1_weight]
-            if w3_weight in weights:
-                self.experts_up_projs[i_experts_ep] = weights[w3_weight]
-            if w2_weight in weights:
-                self.w2_list[i_experts_ep] = weights[w2_weight]
-
-        # Load weight parameters for redundant experts
+            self._copy_expert_weights(i_experts_ep, i_experts, weights)
+
+        # Load redundant experts
         for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
-            i_experts = redundant_expert_id
-            w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.weight"
-            w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.weight"
-            w3_weight = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.weight"
-            if w1_weight in weights:
-                self.experts_gate_projs[n_expert_ep + i] = weights[w1_weight]
-            if w3_weight in weights:
-                self.experts_up_projs[n_expert_ep + i] = weights[w3_weight]
-            if w2_weight in weights:
-                self.w2_list[n_expert_ep + i] = weights[w2_weight]
+            self._copy_expert_weights(n_expert_ep + i, redundant_expert_id, weights)
 
         if self.quantized_weight:
-            self._load_weight_scale(weights)
-        self._fuse()
+            self._load_weight_scale_direct(weights)
+
+    def _copy_expert_weights(self, target_idx, expert_id, weights):
+        """Copy one expert's gate/up/down tensors found in `weights` into slot `target_idx` of w1 / w2."""
+        w1_weight = f"{self.weight_prefix}.{expert_id}.{self.w1_weight_name}.weight"
+        w2_weight = f"{self.weight_prefix}.{expert_id}.{self.w2_weight_name}.weight"
+        w3_weight = f"{self.weight_prefix}.{expert_id}.{self.w3_weight_name}.weight"
+        # NOTE(review): only set by load_hf_weights once this rank's first expert w1 is seen - confirm ordering.
+        intermediate_size = self.intermediate_size
+
+        if w1_weight in weights and w3_weight in weights:
+            # Combine gate and up projections into w1
+            gate_weight = weights[w1_weight]  # [intermediate_size, hidden_size]
+            up_weight = weights[w3_weight]  # [intermediate_size, hidden_size]
+
+            # Copy to pre-allocated memory
+            if not self.quantized_weight and self.quant_method is not None:
+                # Quantized path
+                combined_cpu = torch.empty((intermediate_size * 2, self.hidden_size), dtype=gate_weight.dtype)
+                combined_cpu[:intermediate_size, :] = gate_weight
+                combined_cpu[intermediate_size:, :] = up_weight
+                quantized_pack = self.quant_method.quantize(combined_cpu)
+                self.w1[0][target_idx].copy_(quantized_pack.weight.view(intermediate_size * 2, self.hidden_size))
+                if quantized_pack.weight_scale is not None:
+                    self.w1[1][target_idx].copy_(
+                        quantized_pack.weight_scale.view(intermediate_size * 2, self.hidden_size)
+                    )
+            else:
+                # Regular path
+                self.w1[0][target_idx, :intermediate_size, :].copy_(gate_weight)
+                self.w1[0][target_idx, intermediate_size:, :].copy_(up_weight)
+
+        if w2_weight in weights:
+            # Copy w2 (down projection)
+            w2_weight_tensor = weights[w2_weight]  # [hidden_size, intermediate_size] - already the correct shape
+            if not self.quantized_weight and self.quant_method is not None:
+                quantized_pack = self.quant_method.quantize(w2_weight_tensor)
+                self.w2[0][target_idx].copy_(quantized_pack.weight)
+                if quantized_pack.weight_scale is not None:
+                    self.w2[1][target_idx].copy_(quantized_pack.weight_scale)
+            else:
+                self.w2[0][target_idx].copy_(w2_weight_tensor)
 
     def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
         n_expert_ep = self.ep_n_routed_experts
@@ -530,6 +615,41 @@ def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
             if w2_scale in weights:
                 self.w2_scale_list[n_expert_ep + i] = weights[w2_scale]
 
+    def _load_weight_scale_direct(self, weights: Dict[str, torch.Tensor]) -> None:
+        """Load weight scales directly to pre-allocated GPU memory"""
+        n_expert_ep = self.ep_n_routed_experts
+
+        # Load regular expert scales
+        for i_experts_ep in range(n_expert_ep):
+            i_experts = i_experts_ep + n_expert_ep * self.global_rank_
+            self._copy_expert_scales(i_experts_ep, i_experts, weights)
+
+        # Load redundant expert scales
+        for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
+            self._copy_expert_scales(n_expert_ep + i, redundant_expert_id, weights)
+
+    def _copy_expert_scales(self, target_idx, expert_id, weights):
+        """Copy a single expert's weight scales to pre-allocated GPU memory"""
+        w1_scale = f"{self.weight_prefix}.{expert_id}.{self.w1_weight_name}.{self.weight_scale_suffix}"
+        w2_scale = f"{self.weight_prefix}.{expert_id}.{self.w2_weight_name}.{self.weight_scale_suffix}"
+        w3_scale = f"{self.weight_prefix}.{expert_id}.{self.w3_weight_name}.{self.weight_scale_suffix}"
+
+        intermediate_size = self.intermediate_size
+
+        if w1_scale in weights and w3_scale in weights:
+            # Combine gate and up projection scales into w1 scale
+            gate_scale = weights[w1_scale]  # [intermediate_size, hidden_size]
+            up_scale = weights[w3_scale]  # [intermediate_size, hidden_size]
+
+            # Copy to pre-allocated memory
+            self.w1[1][target_idx, :intermediate_size, :].copy_(gate_scale)
+            self.w1[1][target_idx, intermediate_size:, :].copy_(up_scale)
+
+        if w2_scale in weights:
+            # Copy w2 scale (down projection)
+            w2_scale_tensor = weights[w2_scale]  # [hidden_size, intermediate_size]
+            self.w2[1][target_idx].copy_(w2_scale_tensor)
+
     def _cuda(self, cpu_tensor):
         device_id = get_current_device_id()
         if self.quantized_weight:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep_redundancy.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py
similarity index 96%
rename from lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep_redundancy.py
rename to lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py
index b53200d4c..933a94f78 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep_redundancy.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py
@@ -102,12 +102,12 @@ def _fuse(self):
                 inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
                 w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
                 if not self._ep_w.quantized_weight and self._ep_w.quant_method is not None:
-                    qw1, qw1_scale, qw1_zero_point = self._ep_w.quant_method.quantize(w1)
-                    qw2, qw2_scale, qw2_zero_point = self._ep_w.quant_method.quantize(w2)
-                    self.w1[0] = qw1
-                    self.w1[1] = qw1_scale
-                    self.w2[0] = qw2
-                    self.w2[1] = qw2_scale
+                    qw1_pack = self._ep_w.quant_method.quantize(w1)
+                    qw2_pack = self._ep_w.quant_method.quantize(w2)
+                    self.w1[0] = qw1_pack.weight
+                    self.w1[1] = qw1_pack.weight_scale
+                    self.w2[0] = qw2_pack.weight
+                    self.w2[1] = qw2_pack.weight_scale
                 else:
                     self.w1[0] = w1
                     self.w2[0] = w2
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
new file mode 100644
index 000000000..bf7b218b7
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
@@ -0,0 +1,325 @@
+import os
+import torch
+import threading
+from typing import Tuple, List, Dict, Any, Union, Callable
+from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeight
+from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id, get_dp_world_size
+from lightllm.common.quantization import Quantcfg
+from lightllm.common.quantization.quantize_method import WeightPack
+from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_slicer import (
+    get_row_slice_mixin,
+    get_col_slice_mixin,
+)
+
+
+def create_tp_moe_wegiht_obj(
+    gate_proj_name: str,
+    down_proj_name: str,
+    up_proj_name: str,
+    e_score_correction_bias_name: str,
+    weight_prefix: str,
+    n_routed_experts: int,
+    num_fused_shared_experts: int,
+    split_inter_size: int,
+    data_type: torch.dtype,
+    network_config: Dict[str, Any],
+    layer_num: int,
+    quant_cfg: Quantcfg = None,
+) -> Union["FusedMoeWeightTP", "FusedAWQMARLINMoeWeightTP"]:
+    """Build the tensor-parallel fused-MoE weight object for one layer.
+
+    The layer's "fused_moe" quantization method is looked up on ``quant_cfg``; when it is
+    named "awq_marlin" a ``FusedAWQMARLINMoeWeightTP`` is built, otherwise a plain
+    ``FusedMoeWeightTP``. Every argument is forwarded to the chosen constructor under the
+    same keyword.
+
+    Note that ``quant_cfg`` is dereferenced immediately, so despite its ``None`` default a
+    real config object has to be supplied.
+    """
+    method = quant_cfg.get_quant_method(layer_num, "fused_moe")
+    wants_marlin = method is not None and method.method_name == "awq_marlin"
+    # Both classes are declared further down this module; the names resolve at call time.
+    weight_cls = FusedAWQMARLINMoeWeightTP if wants_marlin else FusedMoeWeightTP
+
+    # The marlin subclass forwards its arguments to the base constructor, so one keyword set fits both.
+    init_kwargs = dict(
+        gate_proj_name=gate_proj_name,
+        down_proj_name=down_proj_name,
+        up_proj_name=up_proj_name,
+        e_score_correction_bias_name=e_score_correction_bias_name,
+        weight_prefix=weight_prefix,
+        n_routed_experts=n_routed_experts,
+        num_fused_shared_experts=num_fused_shared_experts,
+        split_inter_size=split_inter_size,
+        data_type=data_type,
+        network_config=network_config,
+        layer_num=layer_num,
+        quant_cfg=quant_cfg,
+    )
+    return weight_cls(**init_kwargs)
+
+
+class FusedMoeWeightTP(BaseWeight):
+    def __init__(
+        self,
+        gate_proj_name: str,
+        down_proj_name: str,
+        up_proj_name: str,
+        e_score_correction_bias_name: str,
+        weight_prefix: str,
+        n_routed_experts: int,
+        num_fused_shared_experts: int,
+        split_inter_size: int,
+        data_type: torch.dtype,
+        network_config: Dict[str, Any],
+        layer_num: int,
+        quant_cfg: Quantcfg = None,
+    ) -> None:
+        super().__init__()
+        self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
+        self.quantized_weight = quant_cfg.quantized_weight
+        if self.quant_method.method_name != "none":
+            self.weight_scale_suffix = self.quant_method.weight_scale_suffix
+
+        self.w1_weight_name = gate_proj_name
+        self.w2_weight_name = down_proj_name
+        self.w3_weight_name = up_proj_name
+
+        self.e_score_correction_bias_name = e_score_correction_bias_name
+        self.weight_prefix = weight_prefix
+        assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
+        self.n_routed_experts = n_routed_experts + num_fused_shared_experts
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0)
+        self.split_inter_size = split_inter_size
+        self.data_type_ = data_type
+        self.hidden_size = network_config.get("hidden_size")
+        self.tp_rank_ = get_current_rank_in_dp()
+        self.e_score_correction_bias = None
+        self.scoring_func = network_config.get("scoring_func", "softmax")
+        self.row_slicer = get_row_slice_mixin(
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=get_dp_world_size()
+        )
+        self.col_slicer = get_col_slice_mixin(
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=get_dp_world_size()
+        )
+        self._create_weight()
+
+    def _create_weight(self):  # Create the w13 / w2 WeightPacks via the quant method's create_weight.
+        total_expert_num = self.n_routed_experts
+        intermediate_size = self.split_inter_size
+        device_id = get_current_device_id()
+        # NOTE(review): __init__ assigns e_score_correction_bias = None just before calling this method,
+        # so the guard below is always False on that path and the bias buffer is never created here.
+        if self.e_score_correction_bias is not None:
+            self.e_score_correction_bias = torch.empty(
+                (total_expert_num,),
+                dtype=self.data_type_,
+                device=f"cuda:{device_id}",
+            )
+        # w13 has out_dim 2 * split_inter_size: _load_expert puts gate at offset 0, up at split_inter_size.
+        self.w13: WeightPack = self.quant_method.create_weight(
+            out_dim=intermediate_size * 2,
+            in_dim=self.hidden_size,
+            dtype=self.data_type_,
+            device_id=device_id,
+            num_experts=total_expert_num,
+        )
+        self.w2: WeightPack = self.quant_method.create_weight(
+            out_dim=self.hidden_size,
+            in_dim=intermediate_size,
+            dtype=self.data_type_,
+            device_id=device_id,
+            num_experts=total_expert_num,
+        )
+
+    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
+        from lightllm.common.fused_moe.topk_select import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=input_tensor,
+            router_logits=router_logits,
+            correction_bias=self.e_score_correction_bias,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            scoring_func=self.scoring_func,
+        )
+        topk_weights.mul_(self.routed_scaling_factor)
+        if self.num_fused_shared_experts > 0:
+            pad_topk_ids = (
+                torch.arange(
+                    start=self.n_routed_experts - self.num_fused_shared_experts,
+                    end=self.n_routed_experts,
+                    step=1,
+                    dtype=topk_ids.dtype,
+                    device="cuda",
+                )
+                .view(1, self.num_fused_shared_experts)
+                .repeat(topk_ids.shape[0], 1)
+            )
+            pad_topk_weights = torch.full(
+                (topk_weights.shape[0], self.num_fused_shared_experts),
+                fill_value=1.0,
+                device="cuda",
+                dtype=topk_weights.dtype,
+            )
+
+            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
+            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
+
+        w13, w13_scale = self.w13.weight, self.w13.weight_scale
+        w2, w2_scale = self.w2.weight, self.w2.weight_scale
+        use_fp8_w8a8 = self.quant_method.method_name != "none"
+
+        from lightllm.common.fused_moe.grouped_fused_moe import fused_experts
+
+        fused_experts(
+            hidden_states=input_tensor,
+            w1=w13,
+            w2=w2,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w13_scale,
+            w2_scale=w2_scale,
+        )
+        return
+
+    def _cuda(self, cpu_tensor):
+        """Return `cpu_tensor` moved to the current CUDA device (`get_current_device_id()`)."""
+        # The quantized and unquantized branches performed the identical transfer, so the
+        # `self.quantized_weight` check was dead weight and is dropped.
+        return cpu_tensor.cuda(get_current_device_id())
+
+    def verify_load(self):
+        return True
+
+    def load_hf_weights(self, weights):
+        """Load the routing bias and every expert's tensors that are present in `weights`."""
+        if self.e_score_correction_bias_name in weights:
+            # Bias stays None after __init__ (never pre-allocated): assign a device copy, don't copy_ into None.
+            self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])
+        # Load each expert with TP slicing
+        for i_experts in range(self.n_routed_experts):
+            self._load_expert(i_experts, weights, type="weight", suffix=self.quant_method.weight_suffix)
+            if self.w13.weight_scale is not None:
+                self._load_expert(i_experts, weights, type="weight_scale", suffix=self.quant_method.weight_scale_suffix)
+            if self.w13.weight_zero_point is not None:
+                self._load_expert(
+                    i_experts, weights, type="weight_zero_point", suffix=self.quant_method.weight_zero_point_suffix
+                )
+
+    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0):
+        if self.quant_method.weight_need_quanted(weight):
+            self.quant_method.quantize(weight, weight_pack, start_idx)
+        else:
+            self.quant_method.load_weight(weight, weight_pack, start_idx)
+
+    def _load_expert(self, expert_idx, weights, type: str, suffix: str = "weight"):
+        w1_weight = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{suffix}"
+        w2_weight = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{suffix}"
+        w3_weight = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{suffix}"
+        intermediate_size = self.split_inter_size
+        load_func, slice_func = self._get_load_and_slice_func(type, is_row=True)
+        if w1_weight in weights:
+            load_func(slice_func(weights[w1_weight]), self.w13.get_expert(expert_idx), start_idx=0)
+        if w3_weight in weights:
+            load_func(slice_func(weights[w3_weight]), self.w13.get_expert(expert_idx), start_idx=intermediate_size)
+
+        load_func, slice_func = self._get_load_and_slice_func(type, is_row=False)
+        if w2_weight in weights:
+            load_func(slice_func(weights[w2_weight]), self.w2.get_expert(expert_idx), start_idx=0)
+
+    def _get_load_and_slice_func(self, type: str, is_row: bool = True):
+        """Return the (load_func, slice_func) pair for `type`, using the row or column TP slicer."""
+        slicer = self.row_slicer if is_row else self.col_slicer
+        if type == "weight":
+            return self._load_weight_func, slicer._slice_weight
+        if type == "weight_scale":
+            return self.quant_method.load_weight_scale, slicer._slice_weight_scale
+        if type == "weight_zero_point":
+            return self.quant_method.load_weight_zero_point, slicer._slice_weight_zero_point
+        # Fail loudly: falling through returned None, which callers hit as an opaque unpacking TypeError.
+        raise ValueError(f"unsupported tensor type: {type!r}")
+
+
+class FusedAWQMARLINMoeWeightTP(FusedMoeWeightTP):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops
+
+        assert HAS_VLLM, "moe awq marlin quantization requires kernels of vllm"
+        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+            marlin_make_workspace_new,
+        )
+
+        self.workspace = marlin_make_workspace_new(self.w13.weight.device, 4)
+
+    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
+        from lightllm.common.fused_moe.topk_select import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=input_tensor,
+            router_logits=router_logits,
+            correction_bias=self.e_score_correction_bias,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            scoring_func=self.scoring_func,
+        )
+        topk_weights.mul_(self.routed_scaling_factor)
+        if self.num_fused_shared_experts > 0:
+            pad_topk_ids = (
+                torch.arange(
+                    start=self.n_routed_experts - self.num_fused_shared_experts,
+                    end=self.n_routed_experts,
+                    step=1,
+                    dtype=topk_ids.dtype,
+                    device="cuda",
+                )
+                .view(1, self.num_fused_shared_experts)
+                .repeat(topk_ids.shape[0], 1)
+            )
+            pad_topk_weights = torch.full(
+                (topk_weights.shape[0], self.num_fused_shared_experts),
+                fill_value=1.0,
+                device="cuda",
+                dtype=topk_weights.dtype,
+            )
+
+            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
+            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
+
+        w1, w1_scale, w1_zero_point = self.w13.weight, self.w13.weight_scale, self.w13.weight_zero_point
+        w2, w2_scale, w2_zero_point = self.w2.weight, self.w2.weight_scale, self.w2.weight_zero_point
+
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+
+        fused_marlin_moe(
+            input_tensor,
+            w1,
+            w2,
+            None,
+            None,
+            w1_scale,
+            w2_scale,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            quant_type_id=self.quant_method.vllm_quant_type.id,
+            apply_router_weight_on_input=False,
+            global_num_experts=-1,
+            expert_map=None,
+            w1_zeros=w1_zero_point,
+            w2_zeros=w2_zero_point,
+            workspace=self.workspace,
+            inplace=True,
+        )
+
+        return
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
similarity index 99%
rename from lightllm/common/basemodel/layer_weights/meta_weights/gpt_oss_fused_moe_weight_tp.py
rename to lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index df72cc620..9d79ff7c2 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -3,7 +3,7 @@
 import threading
 from typing import Optional, Tuple, List, Dict, Any
 
-from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe_weight_tp import FusedMoeWeightTP
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight_tp import FusedMoeWeightTP
 from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id
 from lightllm.common.quantization import Quantcfg
 from lightllm.utils.log_utils import init_logger
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
index 63605b177..34d989b01 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
@@ -1,10 +1,5 @@
 from .mm_weight import (
-    MMWeightPack,
     MMWeightTpl,
 )
-from .mm_factory import (
-    MMWeight,
-    ROWMMWeight,
-    ROWBMMWeight,
-    COLMMWeight,
-)
+from .rowmm_weight import ROWMMWeight, ROWBMMWeight
+from .colmm_weight import COLMMWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py
index 281f30f02..bf73b9ad8 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py
@@ -1,19 +1,19 @@
 import torch
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import (
     MMWeightTpl,
-    DeepGemmFP8W8A8B128MMWeight,
-    AWQMMWeightTpl,
 )
 from lightllm.common.quantization import Quantcfg
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.common.quantization.quantize_method import QuantizationMethod
 from typing import Dict, List, Optional, Union
-from .mm_slicer import ColSliceMixin, QuantizedColSliceMixin, AwqQuantizedColSliceMixin
+from .mm_slicer import get_col_slice_mixin
 
 
-class StandardCOLMMWeight(MMWeightTpl):
+class COLMMWeight(MMWeightTpl):
     def __init__(
         self,
+        in_dim: int,
+        out_dims: Optional[Union[int, List[int]]],
         weight_names: Union[str, List[str]],
         data_type: torch.dtype,
         bias_names: Optional[Union[str, List[str]]] = None,
@@ -22,6 +22,8 @@ def __init__(
         tp_world_size: int = None,
     ) -> None:
         super().__init__(
+            in_dim=in_dim,
+            out_dims=out_dims,
             weight_names=weight_names,
             data_type=data_type,
             bias_names=bias_names,
@@ -29,74 +31,6 @@ def __init__(
             tp_rank=tp_rank,
             tp_world_size=tp_world_size,
         )
-        self.param_slicer = ColSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
-
-
-class DeepGemmFP8W8A8B128COLMMWeight(DeepGemmFP8W8A8B128MMWeight):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        data_type: torch.dtype,
-        bias_names: Optional[Union[str, List[str]]] = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            data_type=data_type,
-            bias_names=bias_names,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
-        )
-        self.param_slicer = QuantizedColSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
-
-
-class AWQCOLMMWeight(AWQMMWeightTpl):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        data_type: torch.dtype,
-        bias_names: Optional[Union[str, List[str]]] = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            data_type=data_type,
-            bias_names=bias_names,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
+        self.param_slicer = get_col_slice_mixin(
+            self.quant_method.method_name, tp_rank=tp_rank, tp_world_size=tp_world_size
         )
-        # 注意这里不是错误，因为awq的weight是按inxout存的
-        self.param_slicer = AwqQuantizedColSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
-
-
-class AWQMARLINCOLMMWeight(AWQCOLMMWeight):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        data_type: torch.dtype,
-        bias_names: Optional[Union[str, List[str]]] = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            data_type=data_type,
-            bias_names=bias_names,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
-        )
-
-
-COLMM_WEIGHT_CLS_MAP = {
-    "deepgemm-fp8w8a8-b128": DeepGemmFP8W8A8B128COLMMWeight,
-    "awq": AWQCOLMMWeight,
-    "awq_marlin": AWQMARLINCOLMMWeight,
-}
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_factory.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_factory.py
deleted file mode 100644
index 464de8441..000000000
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_factory.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from lightllm.common.quantization import Quantcfg
-from lightllm.common.quantization.quantize_method import QuantizationMethod
-from typing import Type, Union, Dict
-from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import (
-    MMWeightTpl,
-    BMMWeightTpl,
-)
-from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.rowmm_weight import (
-    StandardROWMMWeight,
-    UnquantizedROWBMMWeight,
-    ROWMM_WEIGHT_CLS_MAP,
-)
-from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.colmm_weight import (
-    StandardCOLMMWeight,
-    COLMM_WEIGHT_CLS_MAP,
-)
-
-
-class MMWeight:
-    def __new__(cls, **kwargs):
-        """
-        weight_names,
-        data_type,
-        bias_names,
-        quant_cfg,
-        layer_num,
-        name,
-        tp_rank,
-        tp_world_size,
-        ...
-        该类主要是通过重载 __new__ 为对应的mm权重绑定量化方法，其他参数都是透传。
-        """
-
-        quant_cfg = kwargs.pop("quant_cfg", None)
-        layer_num_ = kwargs.pop("layer_num", None)
-        name = kwargs.pop("name", None)
-        quant_method, quantized_weight = cls._get_quant_method(quant_cfg, layer_num_, name)
-        # quantized_weight 本身是用来标识权重本身在文件中是否是以量化后的形式存储，
-        # 现在不再使用该参数，是否量化由后续的加载过程自动识别。
-        kwargs["quant_method"] = quant_method
-        mmcls = cls._get_mmcls(quant_method)
-        return mmcls(**kwargs)
-
-    @classmethod
-    def _get_quant_method(cls, quant_cfg: Quantcfg, layer_num_: int, name: str) -> QuantizationMethod:
-        if quant_cfg is None:
-            return None, False
-        quant_method: QuantizationMethod = quant_cfg.get_quant_method(layer_num_, name)
-        if quant_method is None:
-            return None, False
-        quant_method.hf_quantization_config = quant_cfg.hf_quantization_config
-        quantized_weight = quant_cfg.quantized_weight
-        return quant_method, quantized_weight
-
-    @classmethod
-    def _get_mmcls(cls, quant_method: QuantizationMethod) -> Type[Union[MMWeightTpl, BMMWeightTpl]]:
-        raise NotImplementedError("Subclasses must implement _get_mmcls method")
-
-
-class ROWMMWeight(MMWeight):
-    @classmethod
-    def _get_mmcls(cls, quant_method: QuantizationMethod):
-        if quant_method is None:
-            return StandardROWMMWeight
-
-        return ROWMM_WEIGHT_CLS_MAP.get(
-            quant_method.method_name,
-            StandardROWMMWeight,
-        )
-
-
-class ROWBMMWeight(MMWeight):
-    @classmethod
-    def _get_mmcls(cls, quant_method: QuantizationMethod):
-        if quant_method is None:
-            return UnquantizedROWBMMWeight
-        else:
-            # TODO: Implement more quantization weight
-            raise NotImplementedError("ROWBMMWeight is not implemented")
-
-
-class COLMMWeight(MMWeight):
-    @classmethod
-    def _get_mmcls(cls, quant_method: QuantizationMethod):
-        if quant_method is None:
-            return StandardCOLMMWeight
-        return COLMM_WEIGHT_CLS_MAP.get(
-            quant_method.method_name,
-            StandardCOLMMWeight,
-        )
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
index e3ef5b0ea..e2830ab61 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
@@ -132,3 +132,21 @@ def __init__(self, tp_rank: int = None, tp_world_size: int = None):
 
     def _slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         return bias / self.tp_world_size_
+
+
+def get_row_slice_mixin(quant_method_name: str, tp_rank: int = None, tp_world_size: int = None) -> SliceMixinTpl:
+    """Return the row-wise TP slicer matching the quantization method name."""
+    if quant_method_name == "none":
+        slicer_cls = RowSliceMixin
+    else:
+        slicer_cls = AwqQuantizedRowSliceMixin if quant_method_name.startswith("awq") else QuantizedRowSliceMixin
+    return slicer_cls(tp_rank, tp_world_size)
+
+
+def get_col_slice_mixin(quant_method_name: str, tp_rank: int = None, tp_world_size: int = None) -> SliceMixinTpl:
+    """Return the column-wise TP slicer matching the quantization method name."""
+    if quant_method_name == "none":
+        slicer_cls = ColSliceMixin
+    else:
+        slicer_cls = AwqQuantizedColSliceMixin if quant_method_name.startswith("awq") else QuantizedColSliceMixin
+    return slicer_cls(tp_rank, tp_world_size)
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 7391454da..92236b798 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -5,9 +5,10 @@
 from dataclasses import dataclass
 from typing import Optional, Tuple, List, Dict, Union, Type
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-from lightllm.common.quantization.quantize_method import QuantizationMethod
+from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
 from lightllm.common.quantization import Quantcfg
+from lightllm.common.quantization.no_quant import NoQuantization
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.utils.log_utils import init_logger
 from .mm_slicer import SliceMixinTpl
@@ -15,53 +16,11 @@
 logger = init_logger(__name__)
 
 
-@dataclass
-class MMWeightPack:
-    weight: Optional[torch.Tensor] = None
-    bias: Optional[torch.Tensor] = None
-    weight_scale: Optional[torch.Tensor] = None
-    weight_zero_point: Optional[torch.Tensor] = None
-
-    has_bias: bool = False
-    has_weight_scale: bool = False
-    has_weight_zero_point: bool = False
-
-    def is_ready(self) -> bool:
-        return (
-            self.weight is not None
-            and (not self.has_bias or (self.has_bias and self.bias is not None))
-            and (not self.has_weight_scale or (self.has_weight_scale and self.weight_scale is not None))
-            and (not self.has_weight_zero_point or (self.has_weight_zero_point and self.weight_zero_point is not None))
-        )
-
-    def ready_for_fused_merge(self) -> bool:
-        """
-        判断权重是否满足可以和其他权重进行融合cat的条件，因为可能权重是量化和非量化后的权重，所以复杂一些。
-        """
-        weight_ready = self.weight is not None and self.weight.dtype in [
-            torch.bfloat16,
-            torch.float16,
-            torch.float32,
-            torch.float64,
-        ]
-        bias_ready = (self.has_bias and self.bias is not None) or (not self.has_bias)
-        if weight_ready and bias_ready:
-            return True
-        else:
-            return self.is_ready()
-
-    def is_load_finished(self):
-        return (
-            (self.is_ready() and self.weight.is_cuda)
-            and ((self.has_bias and self.bias.is_cuda) or (not self.has_bias))
-            and ((self.has_weight_scale and self.weight_scale.is_cuda) or (not self.has_weight_scale))
-            and ((self.has_weight_zero_point and self.weight_zero_point.is_cuda) or (not self.has_weight_zero_point))
-        )
-
-
 class MMWeightTpl(BaseWeightTpl):
     def __init__(
         self,
+        in_dim: int,
+        out_dims: Optional[Union[int, List[int]]],
         weight_names: Union[str, List[str]],
         bias_names: Optional[Union[str, List[str]]],
         data_type: torch.dtype,
@@ -72,6 +31,14 @@ def __init__(
         super().__init__(tp_rank, tp_world_size, data_type)
         self.lock = threading.Lock()
 
+        self.in_dim = in_dim
+        if isinstance(out_dims, int):
+            out_dims = [out_dims]
+        self.out_dims = out_dims
+        self.cusum_out_dims = [0]
+        for out_dim in out_dims[:-1]:
+            self.cusum_out_dims.append(self.cusum_out_dims[-1] + out_dim)
+
         if isinstance(weight_names, str):
             weight_names = [weight_names]
         if isinstance(bias_names, str):
@@ -82,60 +49,29 @@ def __init__(
             if bias_names[0] is None:
                 bias_names = None
 
-        if quant_method is not None:
-            has_weight_scale = quant_method.has_weight_scale
-            has_weight_zero_point = quant_method.has_weight_zero_point
-        else:
-            has_weight_scale = False
-            has_weight_zero_point = False
-
         # 同时存在 weight_names 和 quanted_weight_names 是为了兼容在线和离线两种加载方案
         self.weight_names = weight_names
-
         self.bias_names = bias_names
-        has_bias = self.bias_names is not None
-
-        self.gen_weight_quant_param_names(quant_method=quant_method)
-        self.quant_method = quant_method
-        self.sub_child_mm_params: List[MMWeightPack] = [
-            MMWeightPack(
-                has_bias=has_bias,
-                has_weight_scale=has_weight_scale,
-                has_weight_zero_point=has_weight_zero_point,
-            )
-            for _ in range(len(weight_names))
-        ]
-        self.mm_param: MMWeightPack = MMWeightPack(
-            has_bias=has_bias,
-            has_weight_scale=has_weight_scale,
-            has_weight_zero_point=has_weight_zero_point,
-        )
+        self.quant_method: QuantizationMethod = NoQuantization() if quant_method is None else quant_method
         self.param_slicer: SliceMixinTpl = None
+        self._create_weight()
+        self.gen_weight_quant_param_names(quant_method=quant_method)
 
-        self.weight_fused_dim = 0
-        self.bias_fused_dim = 0
-        self.weight_scale_and_zero_point_fused_dim = 0
-
-        self.load_finished: bool = False
+    def _create_weight(self):
+        """Allocate the fused bias buffer (when bias names exist) and the quant method's weight pack on this device."""
+        self.bias = None
+        if self.bias_names is not None:
+            self.bias = torch.empty(sum(self.out_dims), dtype=self.data_type_).cuda(get_current_device_id())
+        self.mm_param: WeightPack = self.quant_method.create_weight(
+            in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()
+        )
 
     def mm(
         self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
     ) -> torch.Tensor:
-        if self.quant_method is not None:
-            return self.quant_method.apply(
-                input_tensor, self.mm_param, out, use_custom_tensor_mananger=use_custom_tensor_mananger
-            )
-        if out is None:
-            shape = (input_tensor.shape[0], self.mm_param.weight.shape[1])
-            dtype = input_tensor.dtype
-            device = input_tensor.device
-            if use_custom_tensor_mananger:
-                out = g_cache_manager.alloc_tensor(shape, dtype, device=device)
-            else:
-                out = torch.empty(shape, dtype=dtype, device=device)
-        if self.mm_param.bias is None:
-            return torch.mm(input_tensor, self.mm_param.weight, out=out)
-        return torch.addmm(self.mm_param.bias, input_tensor, self.mm_param.weight, out=out)
+        return self.quant_method.apply(
+            input_tensor, self.mm_param, out, use_custom_tensor_mananger=use_custom_tensor_mananger, bias=self.bias
+        )
 
     def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod]):
         if quant_method is None:
@@ -176,8 +112,6 @@ def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod
         return
 
     def load_hf_weights(self, weights):
-        if self.mm_param.is_load_finished():
-            return
 
         for sub_child_index, param_name in enumerate(self.weight_names):
             self._load_weight(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
@@ -196,51 +130,8 @@ def load_hf_weights(self, weights):
             for sub_child_index, param_name in enumerate(self.weight_zero_point_names):
                 self._load_weight_zero_point(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
 
-        with self.lock:
-            # 如果需要fused的请求，全部ok了以后进行merge操作。, all([]) 竟然返回是True, 需要len(self.sub_child_mm_params) > 0 的额外判断。
-            if len(self.sub_child_mm_params) > 0 and all(e.ready_for_fused_merge() for e in self.sub_child_mm_params):
-                self._fuse_weights()
-                self.sub_child_mm_params.clear()
-
-            # 在线量化操作
-            if (
-                self.quant_method is not None
-                and self.mm_param.weight is not None
-                and self.quant_method.weight_need_quanted(self.mm_param.weight)
-                and self.load_finished is False
-            ):
-                logger.info(f"online quant weight names: {self.weight_names}")
-                quantized_weight, weight_scale, weight_zero_point = self.quant_method.quantize(
-                    self.mm_param.weight.cuda(get_current_device_id())
-                )
-                self.mm_param.weight = quantized_weight
-                self.mm_param.weight_scale = weight_scale
-                self.mm_param.weight_zero_point = weight_zero_point
-
-            # repack 操作
-            if (
-                self.quant_method is not None
-                and self.mm_param.is_ready()
-                and self.quant_method.params_need_repack()
-                and self.load_finished is False
-            ):
-                (
-                    self.mm_param.weight,
-                    self.mm_param.weight_scale,
-                    self.mm_param.weight_zero_point,
-                ) = self.quant_method.params_repack(
-                    weight=self.mm_param.weight,
-                    weight_scale=self.mm_param.weight_scale,
-                    weight_zero_point=self.mm_param.weight_zero_point,
-                    dtype_type=self.data_type_,
-                )
-
-            if self.mm_param.is_ready() and self.load_finished is False:
-                self._to_gpu_device()
-                self.load_finished = True
-
     def verify_load(self) -> bool:
-        return self.mm_param.is_ready()
+        return True
 
     # 执行顺序
     def _load_weight(
@@ -248,7 +139,11 @@ def _load_weight(
     ) -> None:
         if param_name in weights:
             weight = self.param_slicer._slice_weight(weights[param_name])
-            self.sub_child_mm_params[sub_child_index].weight = weight
+            start_idx = self.cusum_out_dims[sub_child_index]
+            if self.quant_method.weight_need_quanted(weight):
+                self.quant_method.quantize(weight, self.mm_param, offset=start_idx)
+            else:
+                self.quant_method.load_weight(weight, self.mm_param, start_idx)
         return
 
     def _load_bias(
@@ -256,7 +151,9 @@ def _load_bias(
     ) -> None:
         if param_name in weights:
             bias = self.param_slicer._slice_bias(weights[param_name])
-            self.sub_child_mm_params[sub_child_index].bias = bias
+            start_idx = self.cusum_out_dims[sub_child_index]
+            end_idx = start_idx + bias.shape[0]
+            self.bias[start_idx:end_idx].copy_(bias)  # bias buffer is owned by self, not by mm_param
         return
 
     def _load_weight_scale(
@@ -264,7 +161,8 @@ def _load_weight_scale(
     ) -> None:
         if param_name in weights:
             weight_scale = self.param_slicer._slice_weight_scale(weights[param_name])
-            self.sub_child_mm_params[sub_child_index].weight_scale = weight_scale
+            start_idx = self.cusum_out_dims[sub_child_index]
+            self.quant_method.load_weight_scale(weight_scale, self.mm_param, start_idx)
         return
 
     def _load_weight_zero_point(
@@ -272,88 +170,8 @@ def _load_weight_zero_point(
     ) -> None:
         if param_name in weights:
             weight_zero_point = self.param_slicer._slice_weight_zero_point(weights[param_name])
-            self.sub_child_mm_params[sub_child_index].weight_zero_point = weight_zero_point
-        return
-
-    # weight merge
-    def _fuse_weights(self) -> None:
-        need_merge = len(self.sub_child_mm_params) > 1
-        if self.mm_param.weight is None and all(p.weight is not None for p in self.sub_child_mm_params):
-            if need_merge:
-                weight = torch.cat([p.weight for p in self.sub_child_mm_params], dim=self.weight_fused_dim)
-            else:
-                weight = self.sub_child_mm_params[0].weight
-
-            # 快速删除，防止占用显存过久
-            for p in self.sub_child_mm_params:
-                p.weight = None
-
-            self.mm_param.weight = weight
-
-        if (
-            self.mm_param.has_bias
-            and self.mm_param.bias is None
-            and all(p.bias is not None for p in self.sub_child_mm_params)
-        ):
-            if need_merge:
-                bias = torch.cat([p.bias for p in self.sub_child_mm_params], dim=self.bias_fused_dim)
-            else:
-                bias = self.sub_child_mm_params[0].bias
-
-            # 快速删除，防止占用显存过久
-            for p in self.sub_child_mm_params:
-                p.bias = None
-
-            self.mm_param.bias = bias
-
-        if self.mm_param.weight_scale is None and all(p.weight_scale is not None for p in self.sub_child_mm_params):
-            if need_merge:
-                weight_scale = torch.cat(
-                    [p.weight_scale for p in self.sub_child_mm_params], dim=self.weight_scale_and_zero_point_fused_dim
-                )
-            else:
-                weight_scale = self.sub_child_mm_params[0].weight_scale
-
-            # 快速删除，防止占用显存过久
-            for p in self.sub_child_mm_params:
-                p.weight_scale = None
-
-            self.mm_param.weight_scale = weight_scale
-
-        if self.mm_param.weight_zero_point is None and all(
-            p.weight_zero_point is not None for p in self.sub_child_mm_params
-        ):
-            if need_merge:
-                weight_zero_point = torch.cat(
-                    [p.weight_zero_point for p in self.sub_child_mm_params],
-                    dim=self.weight_scale_and_zero_point_fused_dim,
-                )
-            else:
-                weight_zero_point = self.sub_child_mm_params[0].weight_zero_point
-
-            # 快速删除，防止占用显存过久
-            for p in self.sub_child_mm_params:
-                p.weight_zero_point = None
-
-            self.mm_param.weight_zero_point = weight_zero_point
-        return
-
-    def _to_gpu_device(self) -> None:
-        if self.mm_param.weight is not None:
-            if self.quant_method is not None:
-                self.mm_param.weight = self.mm_param.weight.cuda(get_current_device_id())
-            else:
-                # 让 k dim 更连续，大多数split k 算法的算子可能能更快
-                self.mm_param.weight = (
-                    self.mm_param.weight.to(self.data_type_).cuda(get_current_device_id()).transpose(0, 1)
-                )
-        if self.mm_param.weight_scale is not None:
-            self.mm_param.weight_scale = self.mm_param.weight_scale.cuda(get_current_device_id())
-        if self.mm_param.weight_zero_point is not None:
-            self.mm_param.weight_zero_point = self.mm_param.weight_zero_point.cuda(get_current_device_id())
-        if self.mm_param.bias is not None:
-            # TODO 是不是所有的bias都需要转换为全局设置的数据类型吗，会不会影响精度
-            self.mm_param.bias = self.mm_param.bias.to(self.data_type_).cuda(get_current_device_id())
+            start_idx = self.cusum_out_dims[sub_child_index]
+            self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param, start_idx)
         return
 
 
@@ -376,90 +194,6 @@ def bmm(
                 out = g_cache_manager.alloc_tensor(shape, dtype, device=device)
             else:
                 out = torch.empty(shape, dtype=dtype, device=device)
-        if self.mm_param.bias is None:
+        if self.bias is None:
             return torch.bmm(input_tensor, fpweight, out=out)
-        return torch.addbmm(self.mm_param.bias, input_tensor, fpweight, out=out)
-
-    def _to_gpu_device(self) -> None:
-        if self.mm_param.weight is not None:
-            if self.quant_method is not None:
-                self.mm_param.weight = self.mm_param.weight.cuda(get_current_device_id())
-            else:
-                # bmm 不需要 transpose 操作
-                self.mm_param.weight = self.mm_param.weight.to(self.data_type_).cuda(get_current_device_id())
-        if self.mm_param.weight_scale is not None:
-            self.mm_param.weight_scale = self.mm_param.weight_scale.cuda(get_current_device_id())
-        if self.mm_param.weight_zero_point is not None:
-            self.mm_param.weight_zero_point = self.mm_param.weight_zero_point.cuda(get_current_device_id())
-        if self.mm_param.bias is not None:
-            # TODO 是不是所有的bias都需要转换为全局设置的数据类型吗，会不会影响精度
-            self.mm_param.bias = self.mm_param.bias.to(self.data_type_).cuda(get_current_device_id())
-        return
-
-
-class DeepGemmFP8W8A8B128MMWeight(MMWeightTpl):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        data_type: torch.dtype,
-        bias_names: Optional[Union[str, List[str]]] = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            bias_names=bias_names,
-            data_type=data_type,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
-        )
-
-    def _to_gpu_device(self) -> None:
-        if self.mm_param.weight is not None:
-            self.mm_param.weight = self.mm_param.weight.cuda(get_current_device_id()).transpose(0, 1)
-        if self.mm_param.weight_scale is not None:
-            self.mm_param.weight_scale = self.mm_param.weight_scale.cuda(get_current_device_id()).transpose(0, 1)
-
-        assert self.mm_param.has_weight_zero_point is False
-
-        if self.mm_param.bias is not None:
-            # TODO 是不是所有的bias都需要转换为全局设置的数据类型吗，会不会影响精度
-            self.mm_param.bias = self.mm_param.bias.to(self.data_type_).cuda(get_current_device_id())
-        return
-
-
-class AWQMMWeightTpl(MMWeightTpl):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        bias_names: Optional[Union[str, List[str]]] = None,
-        data_type: torch.dtype = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            bias_names=bias_names,
-            data_type=data_type,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
-        )
-        self.weight_fused_dim = 1
-        self.bias_fused_dim = 0
-        self.weight_scale_and_zero_point_fused_dim = 1
-
-    def _to_gpu_device(self) -> None:
-        if self.mm_param.weight is not None:
-            self.mm_param.weight = self.mm_param.weight.cuda(get_current_device_id())
-        if self.mm_param.weight_scale is not None:
-            self.mm_param.weight_scale = self.mm_param.weight_scale.to(self.data_type_).cuda(get_current_device_id())
-        if self.mm_param.weight_zero_point is not None:
-            self.mm_param.weight_zero_point = self.mm_param.weight_zero_point.cuda(get_current_device_id())
-        if self.mm_param.bias is not None:
-            # TODO 是不是所有的bias都需要转换为全局设置的数据类型吗，会不会影响精度
-            self.mm_param.bias = self.mm_param.bias.to(self.data_type_).cuda(get_current_device_id())
-        return
+        return torch.addbmm(self.bias, input_tensor, fpweight, out=out)
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
index 0eebdc74d..e53d643ce 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
@@ -1,20 +1,20 @@
 import torch
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import (
     MMWeightTpl,
-    DeepGemmFP8W8A8B128MMWeight,
-    AWQMMWeightTpl,
     BMMWeightTpl,
 )
 from lightllm.common.quantization import Quantcfg
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.common.quantization.quantize_method import QuantizationMethod
 from typing import Dict, List, Optional, Union
-from .mm_slicer import RowSliceMixin, QuantizedRowSliceMixin, AwqQuantizedRowSliceMixin
+from .mm_slicer import get_row_slice_mixin
 
 
-class StandardROWMMWeight(MMWeightTpl):
+class ROWMMWeight(MMWeightTpl):
     def __init__(
         self,
+        in_dim: int,
+        out_dims: Optional[Union[int, List[int]]],
         weight_names: Union[str, List[str]],
         data_type: torch.dtype,
         bias_names: Optional[Union[str, List[str]]] = None,
@@ -23,6 +23,8 @@ def __init__(
         tp_world_size: int = None,
     ) -> None:
         super().__init__(
+            in_dim=in_dim,
+            out_dims=out_dims,
             weight_names=weight_names,
             bias_names=bias_names,
             data_type=data_type,
@@ -30,32 +32,12 @@ def __init__(
             tp_rank=tp_rank,
             tp_world_size=tp_world_size,
         )
-        self.param_slicer = RowSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
-
-
-class DeepGemmFP8W8A8B128ROWMMWeight(DeepGemmFP8W8A8B128MMWeight):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        data_type: torch.dtype,
-        bias_names: Optional[Union[str, List[str]]] = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            data_type=data_type,
-            bias_names=bias_names,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
+        self.param_slicer = get_row_slice_mixin(
+            self.quant_method.method_name, tp_rank=tp_rank, tp_world_size=tp_world_size
         )
-        self.param_slicer = QuantizedRowSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
-        return
 
 
-class UnquantizedROWBMMWeight(BMMWeightTpl):
+class ROWBMMWeight(BMMWeightTpl):
     def __init__(
         self,
         weight_names: Union[str, List[str]],
@@ -73,53 +55,5 @@ def __init__(
             tp_rank=tp_rank,
             tp_world_size=tp_world_size,
         )
-        self.param_slicer = RowSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
-
-
-class AWQROWMMWeight(AWQMMWeightTpl):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        data_type: torch.dtype,
-        bias_names: Optional[Union[str, List[str]]] = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            data_type=data_type,
-            bias_names=bias_names,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
-        )
-
-        self.param_slicer = AwqQuantizedRowSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
-
-
-class AWQMARLINROWMMWeight(AWQROWMMWeight):
-    def __init__(
-        self,
-        weight_names: Union[str, List[str]],
-        data_type: torch.dtype,
-        bias_names: Optional[Union[str, List[str]]] = None,
-        quant_method: QuantizationMethod = None,
-        tp_rank: int = None,
-        tp_world_size: int = None,
-    ) -> None:
-        super().__init__(
-            weight_names=weight_names,
-            data_type=data_type,
-            bias_names=bias_names,
-            quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
-        )
-
-
-ROWMM_WEIGHT_CLS_MAP = {
-    "deepgemm-fp8w8a8-b128": DeepGemmFP8W8A8B128ROWMMWeight,
-    "awq": AWQROWMMWeight,
-    "awq_marlin": AWQMARLINROWMMWeight,
-}
+        # BMM does not support quantized compute operations.
+        self.param_slicer = get_row_slice_mixin(quant_method_name="none", tp_rank=tp_rank, tp_world_size=tp_world_size)
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 5a595bff6..619158fa8 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -9,23 +9,36 @@
 logger = init_logger(__name__)
 
 
-class _NormWeight(BaseWeightTpl):
-    def __init__(self, weight_name, data_type, bias_name=None):
+class NormWeight(BaseWeightTpl):
+    def __init__(self, norm_dim: int, weight_name, data_type, bias_name=None):
         super().__init__()
+        self.norm_dim = norm_dim
         self.weight_name = weight_name
         self.bias_name = bias_name
         self.data_type_ = data_type
-        self.weight: torch.Tensor = None
-        self.bias: Optional[torch.Tensor] = None
+        self.weight = None
+        self.bias = None
+        self.is_weight_ready = False
+        self.is_bias_ready = False
+        self._create_weight()
+
+    def _create_weight(self):
+        device = f"cuda:{get_current_device_id()}"
+        self.weight = torch.empty(self.norm_dim, dtype=self.data_type_, device=device)
+        self.bias = (
+            torch.empty(self.norm_dim, dtype=self.data_type_, device=device) if self.bias_name is not None else None
+        )
+
+    def load_hf_weights(self, weights):
+        if self.weight_name in weights:
+            self.weight.copy_(weights[self.weight_name])
+            self.is_weight_ready = True
+        if self.bias_name in weights:
+            self.bias.copy_(weights[self.bias_name])
+            self.is_bias_ready = True
 
     def verify_load(self):
-        load_ok = True
-        # Verify weight. The weight must be not None.
-        load_ok = load_ok and self.weight is not None
-        # Verify bias. If bias_name is set, it must be not None.
-        if self.bias_name is not None:
-            load_ok = load_ok and self.bias is not None
-        return load_ok
+        return self.is_weight_ready and (self.bias_name is None or self.is_bias_ready)
 
     def rmsnorm_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
@@ -36,108 +49,29 @@ def rmsnorm_forward(
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
         return rmsnorm_forward(x=input, weight=self.weight, eps=eps, out=out)
 
-    def layernorm_forward(
-        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
-    ) -> torch.Tensor:
-        assert input.ndim == 2 and self.weight.ndim == 1
-        assert self.bias is not None
-
-        _tout = layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps)
-        if out is None:
-            return _tout
-        else:
-            out.copy_(_tout)
-            return out
-
 
-class NoTpNormWeight(_NormWeight):
-    def __init__(self, weight_name, data_type, bias_name=None):
-        super().__init__(weight_name=weight_name, data_type=data_type, bias_name=bias_name)
-        self.tp_world_size_ = 1
-        self.tp_rank_ = 0
+class GEMMANormWeight(NormWeight):
+    def __init__(self, norm_dim: int, weight_name, data_type, bias_name=None):
+        super().__init__(norm_dim, weight_name, data_type, bias_name)
 
     def load_hf_weights(self, weights):
-        if self.weight_name in weights and self.weight is None:
-            self.weight = weights[self.weight_name].to(self.data_type_).cuda(get_current_device_id())
-        if self.bias_name in weights and self.bias is None:
-            self.bias = weights[self.bias_name].to(self.data_type_).cuda(get_current_device_id())
+        # TODO: could adding 1 directly here cause precision issues? Should compute use (1.0 + weight.float())?
+        if self.weight_name in weights:
+            self.weight.copy_((weights[self.weight_name] + 1).to(self.data_type_))
+            self.is_weight_ready = True
 
 
-class NoTpGEMMANormWeight(_NormWeight):
-    def __init__(self, weight_name, data_type, bias_name=None):
-        super().__init__(weight_name, data_type, bias_name)
-        assert self.bias_name is None
-        self.tp_world_size_ = 1
-        self.tp_rank_ = 0
+class TpNormWeight(NormWeight):
+    def __init__(self, norm_dim: int, weight_name, data_type, bias_name=None):
+        super().__init__(norm_dim, weight_name, data_type, bias_name)
 
     def load_hf_weights(self, weights):
-        if self.weight_name in weights and self.weight is None:
-            self.weight = (weights[self.weight_name] + 1).to(self.data_type_).cuda(get_current_device_id())
-
-
-class TpVitPadNormWeight(_NormWeight):
-    def __init__(self, weight_name, data_type, head_num: int, bias_name=None):
-        super().__init__(weight_name, data_type, bias_name)
-        self.head_num = head_num
-
-    def _pad_tensor_param(self, weight: torch.Tensor):
-        assert weight.ndim == 1
-        hidden_size = weight.shape[0]
-        head_dim = hidden_size // self.head_num
-        assert hidden_size % self.head_num == 0
-
-        if self.head_num % self.tp_world_size_ == 0:
-            return weight
-        else:
-            logger.warning(f"padding {self.weight_name} weights in TpVitPadNormWeight")
-            pad_head_num = self.tp_world_size_ - (self.head_num % self.tp_world_size_)
-            pad_dims = pad_head_num * head_dim
-            weight = torch.nn.functional.pad(weight, (0, pad_dims), mode="constant", value=0.0)
-            return weight
-
-    def load_hf_weights(self, weights):
-        if self.weight_name in weights and self.weight is None:
-            t_weight = weights[self.weight_name]
-            t_weight = self._pad_tensor_param(t_weight)
-            new_hidden_size = t_weight.shape[0]
-            split_n_embed = new_hidden_size // self.tp_world_size_
-            assert new_hidden_size % self.tp_world_size_ == 0
-
-            start = split_n_embed * self.tp_rank_
-            end = split_n_embed * (self.tp_rank_ + 1)
-
-            self.weight = t_weight[start:end].to(self.data_type_).cuda(get_current_device_id())
-
-        if self.bias_name in weights and self.bias is None:
-            t_bias = weights[self.bias_name]
-            t_bias = self._pad_tensor_param(t_bias)
-            new_hidden_size = t_bias.shape[0]
-            split_n_embed = new_hidden_size // self.tp_world_size_
-            assert new_hidden_size % self.tp_world_size_ == 0
-
-            start = split_n_embed * self.tp_rank_
-            end = split_n_embed * (self.tp_rank_ + 1)
-
-            self.bias = t_bias[start:end].to(self.data_type_).cuda(get_current_device_id())
-
-
-class TpHeadNormWeight(_NormWeight):
-    def __init__(self, weight_name, data_type, bias_name=None):
-        super().__init__(weight_name, data_type, bias_name)
-
-    def load_hf_weights(self, weights):
-        if self.weight_name in weights and self.weight is None:
-            t_weight = weights[self.weight_name]
-            start_head_index, end_head_index = self._get_head_tp_split_params(weight=t_weight)
-            self.weight: torch.Tensor = (
-                t_weight[start_head_index:end_head_index].to(self.data_type_).cuda(get_current_device_id())
-            )
-            assert self.weight.ndim == 2
-
-        if self.bias_name in weights and self.bias is None:
-            t_bias = weights[self.bias_name]
-            start_head_index, end_head_index = self._get_head_tp_split_params(weight=t_bias)
-            self.bias: torch.Tensor = (
-                t_bias[start_head_index:end_head_index].to(self.data_type_).cuda(get_current_device_id())
-            )
-            assert self.bias.ndim == 2
+        start = self.norm_dim * self.tp_rank_
+        end = self.norm_dim * (self.tp_rank_ + 1)
+
+        if self.weight_name in weights:
+            self.weight.copy_(weights[self.weight_name][start:end].to(self.data_type_))
+            self.is_weight_ready = True
+        if self.bias_name in weights:
+            self.bias.copy_(weights[self.bias_name][start:end].to(self.data_type_))
+            self.is_bias_ready = True
diff --git a/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py b/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py
index 4bc58c76f..c6ce1049f 100644
--- a/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py
+++ b/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py
@@ -4,6 +4,7 @@
 from .base_layer_weight import BaseLayerWeight
 from .meta_weights import BaseWeight, MMWeightTpl
 from lightllm.utils.log_utils import init_logger
+from lightllm.common.quantization import Quantcfg
 
 logger = init_logger(__name__)
 
@@ -40,3 +41,6 @@ def load_hf_weights(self, weights):
                     attr.load_hf_weights(weights)
             elif isinstance(attr, BaseWeight):
                 attr.load_hf_weights(weights)
+
+    def get_quant_method(self, name):
+        return self.quant_cfg.get_quant_method(self.layer_num_, name)
diff --git a/lightllm/common/quantization/__init__.py b/lightllm/common/quantization/__init__.py
index 26f59258c..ecf2e6d42 100644
--- a/lightllm/common/quantization/__init__.py
+++ b/lightllm/common/quantization/__init__.py
@@ -6,6 +6,7 @@
 from .triton_quant.triton_quant import *
 from .deepgemm_quant import *
 from .awq_quant import *
+from .no_quant import *
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -78,4 +79,6 @@ def get_quant_type(self, layer_num, name):
 
     def get_quant_method(self, layer_num, name):
         quant_type = self.get_quant_type(layer_num, name)
-        return QUANTMETHODS.get(quant_type)
+        quant_method = QUANTMETHODS.get(quant_type)
+        quant_method.hf_quantization_config = self.hf_quantization_config
+        return quant_method
diff --git a/lightllm/common/quantization/awq_quant.py b/lightllm/common/quantization/awq_quant.py
index 8c04cdcea..d523cce75 100644
--- a/lightllm/common/quantization/awq_quant.py
+++ b/lightllm/common/quantization/awq_quant.py
@@ -9,8 +9,7 @@
 from typing import TYPE_CHECKING, Optional, Tuple
 from lightllm.utils.dist_utils import get_current_device_id
 
-if TYPE_CHECKING:
-    from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightPack
+from .quantize_method import WeightPack
 
 if HAS_VLLM:
     awq_dequantize = vllm_ops.awq_dequantize
@@ -39,16 +38,17 @@ def __init__(self):
 
         self.cache_manager = g_cache_manager
 
-    def quantize(self, weight: torch.Tensor):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
@@ -72,21 +72,21 @@ def __init__(self):
     def method_name(self):
         return "awq"
 
-    def quantize(self, weight: torch.Tensor):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
         qzeros = weight_pack.weight_zero_point
-        bias = weight_pack.bias
 
         NEED_DEQUANT_WEIGHT = input_tensor.shape[:-1].numel() >= 256
         if NEED_DEQUANT_WEIGHT:
@@ -99,6 +99,33 @@ def apply(
             out.add_(bias)
         return out
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        """Allocate packed int32 qweight/qzeros and per-group scales directly on cuda:device_id."""
+        device = f"cuda:{device_id}"
+        prefix = (num_experts,) if num_experts > 1 else ()
+        packed_out, groups = out_dim // self.pack_factor, in_dim // self.hf_quantization_config["group_size"]
+        weight = torch.empty(prefix + (in_dim, packed_out), dtype=torch.int32, device=device)
+        weight_scale = torch.empty(prefix + (groups, out_dim), dtype=dtype, device=device)
+        weight_zero_point = torch.empty(prefix + (groups, packed_out), dtype=torch.int32, device=device)
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        start_idx = start_idx // self.pack_factor
+        weight_pack.weight[:, start_idx : start_idx + weight.shape[1]].copy_(weight)
+        return
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        weight_pack.weight_scale[:, start_idx : start_idx + weight_scale.shape[1]].copy_(weight_scale)
+        return
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        start_idx = start_idx // self.pack_factor
+        end_idx = start_idx + weight_zero_point.shape[1]
+        weight_pack.weight_zero_point[:, start_idx:end_idx].copy_(weight_zero_point)
+        return
+
 
 @QUANTMETHODS.register("awq_marlin")
 class AWQMARLINW4A16QuantizationMethod(AWQBaseQuantizationMethod):
@@ -115,20 +142,15 @@ def __init__(self):
         self.vllm_quant_type = TYPE_MAP[self.nbits]
         self.has_weight_scale = True
         self.has_weight_zero_point = True
+        self.tile_size = 16
 
     @property
     def method_name(self):
         return "awq_marlin"
 
-    def quantize(self, weight: torch.Tensor):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
-    def params_need_repack(self) -> bool:
-        """
-        用于说明是否需要对量化后的权重进行repack操作，目前只有awq支持
-        """
-        return True
-
     def params_repack(
         self, weight: torch.Tensor, weight_scale: torch.Tensor, weight_zero_point: torch.Tensor, dtype_type: torch.dtype
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
@@ -144,47 +166,18 @@ def params_repack(
         )
         return weight, weight_scale, weight_zero_point
 
-    def _process_weight_after_loading(self, weight: torch.Tensor) -> torch.Tensor:
-        assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
-        self.k = weight.shape[0]
-        self.n = weight.shape[1] * self.pack_factor
-        return vllm_ops.awq_marlin_repack(
-            weight,
-            size_k=weight.shape[0],
-            size_n=weight.shape[1] * self.pack_factor,
-            num_bits=self.hf_quantization_config["bits"],
-        )
-
-    def _process_weight_scale_after_loading(self, weight_scale: torch.Tensor) -> torch.Tensor:
-        assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
-        group_size = self.hf_quantization_config["group_size"]
-        return marlin_permute_scales(
-            weight_scale,
-            size_k=weight_scale.shape[0] * group_size,
-            size_n=weight_scale.shape[1],
-            group_size=self.hf_quantization_config["group_size"],
-        )
-
-    def _process_weight_zero_point_after_loading(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
-        return awq_to_marlin_zero_points(
-            weight_zero_point,
-            size_k=weight_zero_point.shape[0],
-            size_n=weight_zero_point.shape[1] * self.pack_factor,
-            num_bits=self.hf_quantization_config["bits"],
-        )
-
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
         qzeros = weight_pack.weight_zero_point
-        bias = weight_pack.bias
         reshaped_x = input_tensor.reshape(-1, input_tensor.shape[-1])
 
         use_atomic_add = should_use_atomic_add_reduce(
@@ -219,6 +212,62 @@ def apply(
             out.add_(bias)
         return out
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        """Allocate Marlin-layout qweight plus per-group scales and zero points directly on cuda:device_id."""
+        self.n = out_dim
+        self.k = in_dim
+        device = f"cuda:{device_id}"
+        group_size = self.hf_quantization_config["group_size"]
+        prefix = (num_experts,) if num_experts > 1 else ()
+        marlin_shape = (in_dim // self.tile_size, out_dim * self.tile_size // self.pack_factor)
+        zp_shape = (in_dim // group_size, out_dim // self.pack_factor)
+        weight = torch.empty(prefix + marlin_shape, dtype=torch.int32, device=device)
+        weight_scale = torch.empty(prefix + (in_dim // group_size, out_dim), dtype=dtype, device=device)
+        weight_zero_point = torch.empty(prefix + zp_shape, dtype=torch.int32, device=device)
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
+        device_id = get_current_device_id()
+        repack_weight = vllm_ops.awq_marlin_repack(
+            weight.cuda(device_id),
+            size_k=weight.shape[0],
+            size_n=weight.shape[1] * self.pack_factor,
+            num_bits=self.hf_quantization_config["bits"],
+        )
+        start_idx = start_idx // self.pack_factor * self.tile_size
+        weight_pack.weight[:, start_idx : start_idx + repack_weight.shape[1]].copy_(repack_weight)
+        return
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
+        group_size = self.hf_quantization_config["group_size"]
+        device_id = get_current_device_id()
+        repack_weight_scale = marlin_permute_scales(
+            weight_scale.cuda(device_id),
+            size_k=weight_scale.shape[0] * group_size,
+            size_n=weight_scale.shape[1],
+            group_size=self.hf_quantization_config["group_size"],
+        )
+        weight_pack.weight_scale[:, start_idx : start_idx + repack_weight_scale.shape[1]].copy_(repack_weight_scale)
+        return
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        device_id = get_current_device_id()
+        repack_weight_zero_point = awq_to_marlin_zero_points(
+            weight_zero_point.cuda(device_id),
+            size_k=weight_zero_point.shape[0],
+            size_n=weight_zero_point.shape[1] * self.pack_factor,
+            num_bits=self.hf_quantization_config["bits"],
+        )
+        start_idx = start_idx // self.pack_factor
+        weight_pack.weight_zero_point[:, start_idx : start_idx + repack_weight_zero_point.shape[1]].copy_(
+            repack_weight_zero_point
+        )
+        return
+
 
 # adapted from
 # https://github.com/vllm-project/vllm/blob/aef368aa08572505b820db01da82e2fbb3d43a72/vllm/model_executor/layers/quantization/awq_marlin.py#L211-L212
diff --git a/lightllm/common/quantization/deepgemm_quant.py b/lightllm/common/quantization/deepgemm_quant.py
index 7dbd3806b..86dd9b572 100644
--- a/lightllm/common/quantization/deepgemm_quant.py
+++ b/lightllm/common/quantization/deepgemm_quant.py
@@ -1,5 +1,6 @@
 import os
 import torch
+from torch.types import Device
 from .quantize_method import QuantizationMethod
 from .registry import QUANTMETHODS
 import torch.nn.functional as F
@@ -9,8 +10,8 @@
 )
 from typing import TYPE_CHECKING, Optional
 
-if TYPE_CHECKING:
-    from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightPack
+from .quantize_method import WeightPack
+
 try:
     HAS_DEEPGEMM = True
     import deep_gemm
@@ -26,17 +27,17 @@ def __init__(self):
         self.cache_manager = g_cache_manager
         assert HAS_DEEPGEMM, "deepgemm is not installed, you can't use quant api of it"
 
-    def quantize(self, weight: torch.Tensor):
-        """ """
-        pass
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+        raise NotImplementedError("Not implemented")
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError("Not implemented")
 
@@ -60,26 +61,30 @@ def __init__(self):
     def method_name(self):
         return "deepgemm-fp8w8a8-b128"
 
-    def quantize(self, weight: torch.Tensor):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
         from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
 
-        weight, scale = weight_quant(weight, self.block_size)
-        return weight, scale, None
+        weight, scale = weight_quant(weight.cuda(output.weight.device), self.block_size)
+        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
+        # weight_scale holds one row per block_size weight rows, so convert the row offset to block units.
+        scale_start = offset // self.block_size
+        output.weight_scale[scale_start : scale_start + scale.shape[0]].copy_(scale)
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: "WeightPack",
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
         input_scale = None
         alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
         m, k = input_tensor.shape
-        n = qweight.shape[1]
+        n = qweight.shape[0]
         if input_scale is None:
             qinput_tensor, input_scale = per_token_group_quant_fp8(
                 input_tensor,
@@ -92,9 +97,38 @@ def apply(
 
         if out is None:
             out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-        _deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight.t(), weight_scale.t()), out)
+        _deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight, weight_scale), out)
+        # bias is passed as its own argument (not via the weight pack), so it must be added here.
+        if bias is not None:
+            out.add_(bias)
         return out
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        """Allocate the fp8 weight (out_dim, in_dim) and fp32 per-block scales directly on cuda:device_id."""
+        device = f"cuda:{device_id}"
+        prefix = (num_experts,) if num_experts > 1 else ()
+        scale_shape = prefix + (out_dim // self.block_size, in_dim // self.block_size)
+        weight = torch.empty(prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn, device=device)
+        return WeightPack(weight=weight, weight_scale=torch.empty(scale_shape, dtype=torch.float32, device=device))
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
+        return
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        # start_idx counts weight rows; the scale tensor has one row per block_size weight rows.
+        scale_start = start_idx // self.block_size
+        weight_pack.weight_scale[scale_start : scale_start + weight_scale.shape[0]].copy_(weight_scale)
+        return
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        # start_idx counts weight rows; convert it to block units before slicing the per-block tensor.
+        zp_start = start_idx // self.block_size
+        weight_pack.weight_zero_point[zp_start : zp_start + weight_zero_point.shape[0]].copy_(weight_zero_point)
+        return
+
 
 def _deepgemm_fp8_nt(a_tuple, b_tuple, out):
     if HAS_DEEPGEMM:
diff --git a/lightllm/common/quantization/no_quant.py b/lightllm/common/quantization/no_quant.py
new file mode 100644
index 000000000..f342607c1
--- /dev/null
+++ b/lightllm/common/quantization/no_quant.py
@@ -0,0 +1,52 @@
+from .quantize_method import QuantizationMethod, WeightPack
+from .registry import QUANTMETHODS
+import torch
+from typing import Optional
+
+
+@QUANTMETHODS.register("none")
+class NoQuantization(QuantizationMethod):
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+
+        weight = weight_pack.weight.t()
+        if out is None:
+            shape = (input_tensor.shape[0], weight.shape[1])
+            dtype = input_tensor.dtype
+            device = input_tensor.device
+            if use_custom_tensor_mananger:
+                out = g_cache_manager.alloc_tensor(shape, dtype, device=device, is_graph_out=False)
+            else:
+                out = torch.empty(shape, dtype=dtype, device=device)
+        if bias is None:
+            return torch.mm(input_tensor, weight, out=out)
+        return torch.addmm(bias, input_tensor, weight, out=out)
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        """Allocate an uninitialized [num_experts,] out_dim x in_dim weight directly on cuda:device_id."""
+        shape = ((num_experts,) if num_experts > 1 else ()) + (out_dim, in_dim)
+        return WeightPack(weight=torch.empty(shape, dtype=dtype, device=f"cuda:{device_id}"))
+
+    def weight_need_quanted(self, weight: torch.Tensor) -> bool:
+        return False
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        return
+
+    @property
+    def method_name(self):
+        return "none"
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0) -> None:
+        weight_pack.weight[start_idx : start_idx + weight.shape[0], :].copy_(weight)
+        return
diff --git a/lightllm/common/quantization/quantize_method.py b/lightllm/common/quantization/quantize_method.py
index 9b629bcaf..77e59465e 100644
--- a/lightllm/common/quantization/quantize_method.py
+++ b/lightllm/common/quantization/quantize_method.py
@@ -1,38 +1,58 @@
 import torch
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from lightllm.utils.dist_utils import get_current_device_id
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import Optional, Tuple
 
-if TYPE_CHECKING:
-    from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightPack
+
+@dataclass
+class WeightPack:
+    weight: Optional[torch.Tensor] = None
+    weight_scale: Optional[torch.Tensor] = None
+    weight_zero_point: Optional[torch.Tensor] = None
+
+    def get_expert(self, expert_idx: int):
+        assert self.weight.ndim == 3, f"weight must be a 3D tensor, but got {self.weight.ndim}"
+        weight = self.weight[expert_idx]
+        weight_scale = self.weight_scale[expert_idx] if self.weight_scale is not None else None
+        weight_zero_point = self.weight_zero_point[expert_idx] if self.weight_zero_point is not None else None
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
 
 
 class QuantizationMethod(ABC):
     def __init__(self):
         super().__init__()
         self.device_id_ = get_current_device_id()
-        self.weight_suffix = None
+        self.weight_suffix = "weight"
         self.weight_scale_suffix = None
         self.weight_zero_point_suffix = None
         self.act_scale_suffix = None
         self.has_weight_scale: bool = None
         self.has_weight_zero_point: bool = None
+        self.group_size: int = -1  # -1 means no grouping (per-channel quantization); otherwise the group size
+        self.pack_factor: int = 1
+
         # 一些量化模式需要用到的额外量化参数，如awq量化
         self.hf_quantization_config = None
 
     @abstractmethod
-    def quantize(self, weights: torch.Tensor):
+    def quantize(
+        self,
+        weight: torch.Tensor,
+        output: WeightPack,
+        offset: int = 0,
+    ) -> None:
         pass
 
     @abstractmethod
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
-        bias: Optional[torch.Tensor] = None,
+        weight_pack: "WeightPack",
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         pass
 
@@ -41,20 +61,34 @@ def apply(
     def method_name(self):
         pass
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        """Allocate the (still empty) WeightPack for one weight of this quantization method.
+
+        Subclasses are expected to override this. The base version raises, like the
+        load_weight* hooks of this class, rather than falling through and returning
+        None despite the WeightPack return annotation.
+        """
+        raise NotImplementedError(
+            f"quantization method {self.method_name} is not supported to create weight"
+        )
+
     def weight_need_quanted(self, weight: torch.Tensor) -> bool:
         # 判断一个 weight 是否需要进行量化操作。
         return weight.dtype in [torch.bfloat16, torch.float16, torch.float32, torch.float64]
 
-    def params_need_repack(self) -> bool:
-        """
-        用于说明是否需要对量化后的权重进行repack操作，目前只有awq支持
-        """
-        return False
-
-    def params_repack(
-        self, weight: torch.Tensor, weight_scale: torch.Tensor, weight_zero_point: torch.Tensor, dtype_type: torch.dtype
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        一些量化方法在将参数完成量化后，为了加速性能，还需要将参数进行重拍，使算子性能达到最优，如awq方法。
-        """
-        return weight, weight_scale, weight_zero_point
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        raise NotImplementedError(
+            f"quantization method {self.method_name} is not supported to load offline quantized weight"
+        )
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        raise NotImplementedError(
+            f"quantization method {self.method_name} is not supported to load offline quantized weight scale"
+        )
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        raise NotImplementedError(
+            f"quantization method {self.method_name} is not supported to load offline quantized weight zero point"
+        )
diff --git a/lightllm/common/quantization/registry.py b/lightllm/common/quantization/registry.py
index 674a22b60..e9b407398 100644
--- a/lightllm/common/quantization/registry.py
+++ b/lightllm/common/quantization/registry.py
@@ -1,5 +1,4 @@
 from .quantize_method import QuantizationMethod
-from typing import Type
 
 
 class QuantMethodFactory:
@@ -17,9 +16,7 @@ def decorator(cls):
 
         return decorator
 
-    def get(self, key, *args, **kwargs) -> Type[QuantizationMethod]:
-        if key == "none":
-            return None
+    def get(self, key, *args, **kwargs) -> "QuantizationMethod":
         quant_method_class = self._quant_methods.get(key)
         if not quant_method_class:
             raise ValueError(f"QuantMethod '{key}' not supported.")
diff --git a/lightllm/common/quantization/torchao_quant.py b/lightllm/common/quantization/torchao_quant.py
index ba4115b1d..d1db65b35 100644
--- a/lightllm/common/quantization/torchao_quant.py
+++ b/lightllm/common/quantization/torchao_quant.py
@@ -5,8 +5,7 @@
 import torch.nn.functional as F
 from typing import TYPE_CHECKING, Optional
 
-if TYPE_CHECKING:
-    from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightPack
+from .quantize_method import WeightPack
 
 try:
     HAS_TORCH_AO = True
@@ -34,17 +33,27 @@ def __init__(self):
         assert TORCH_VERSION_AT_LEAST_2_4, "torchao requires torch >=2.4"
         self.quant_func = None
 
-    def quantize(self, weight: torch.Tensor):
-        """ """
+    def quantize(self, weight: torch.Tensor, output: Optional[WeightPack] = None, offset: int = 0) -> WeightPack:
+        """Quantize `weight` with torchao and return it wrapped in a WeightPack.
+
+        `output` follows the QuantizationMethod.quantize(weight, output, offset)
+        signature; when given, its `weight` field is rebound to the quantized
+        parameter. `offset` is accepted for signature compatibility and is unused.
+        """
         dummy_linear = torch.nn.Linear(weight.shape[1], weight.shape[0], bias=False)
         dummy_linear.weight = torch.nn.Parameter(weight.cuda(self.device_id_))
         quantize_(dummy_linear, self.quant_func)
-        return dummy_linear.weight, None, None
+        pack = WeightPack(weight=dummy_linear.weight, weight_scale=None, weight_zero_point=None)
+        if output is not None:
+            # NOTE(review): rebinds the field instead of copy_() into a pre-allocated buffer,
+            # since this class defines no create_weight in view — confirm against the caller.
+            output.weight = pack.weight
+        return pack
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
diff --git a/lightllm/common/quantization/triton_quant/triton_quant.py b/lightllm/common/quantization/triton_quant/triton_quant.py
index 410f925a5..9f6a7bee2 100644
--- a/lightllm/common/quantization/triton_quant/triton_quant.py
+++ b/lightllm/common/quantization/triton_quant/triton_quant.py
@@ -7,8 +7,7 @@
 from .fp8.fp8act_quant_kernel import per_token_group_quant_fp8
 from typing import TYPE_CHECKING, Optional
 
-if TYPE_CHECKING:
-    from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightPack
+from lightllm.common.quantization.quantize_method import WeightPack
 
 
 class TritonBaseQuantizationMethod(QuantizationMethod):
@@ -18,16 +17,17 @@ def __init__(self):
 
         self.cache_manager = g_cache_manager
 
-    def quantize(self, weight: torch.Tensor):
-        pass
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> WeightPack:
+        raise NotImplementedError("Not implemented")
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError("Not implemented")
 
@@ -44,17 +44,18 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    def quantize(self, weight: torch.Tensor):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
         # TODO block-wise quant kernel
-        pass
+        raise NotImplementedError("Not implemented")
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
@@ -83,3 +84,40 @@ def apply(
             dtype=input_tensor.dtype,
         )
         return out
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        """Allocate empty fp8 weight and float32 block-scale tensors on `device_id`.
+
+        The scale has one entry per (block_size x block_size) tile of the weight.
+        A leading expert dimension is added only when num_experts > 1. `dtype` is
+        not used here: storage is always float8_e4m3fn / float32.
+        """
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        weight_scale = torch.empty(
+            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
+        ).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy a pre-quantized weight shard into rows [start_idx, start_idx + rows) of the pack."""
+        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
+        return
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy the block scales of a shard whose weight rows start at `start_idx`.
+
+        `start_idx` counts weight rows; `weight_scale` is assumed to already have one row
+        per block (as create_weight allocates it), so only `start_idx` is divided by block_size.
+        """
+        scale_start = start_idx // self.block_size
+        weight_pack.weight_scale[scale_start : scale_start + weight_scale.shape[0]].copy_(weight_scale)
+        return
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Zero-point counterpart of load_weight_scale, using the same block-row indexing."""
+        zp_start = start_idx // self.block_size
+        weight_pack.weight_zero_point[zp_start : zp_start + weight_zero_point.shape[0]].copy_(weight_zero_point)
+        return
diff --git a/lightllm/common/quantization/w8a8_quant.py b/lightllm/common/quantization/w8a8_quant.py
index 31004de4e..1728e799d 100644
--- a/lightllm/common/quantization/w8a8_quant.py
+++ b/lightllm/common/quantization/w8a8_quant.py
@@ -11,8 +11,8 @@
 from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
 from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
-if TYPE_CHECKING:
-    from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightPack
+
+from .quantize_method import WeightPack
 
 if HAS_LIGHTLLM_KERNEL:
 
@@ -38,16 +38,17 @@ def __init__(self):
 
         self.cache_manager = g_cache_manager
 
-    def quantize(self, weight: torch.Tensor):
-        pass
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        raise NotImplementedError("Not implemented")
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError("Not implemented")
 
@@ -55,6 +56,11 @@ def apply(
     def method_name(self):
         return "w8a8-base"
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        raise NotImplementedError("Not implemented")
+
 
 @QUANTMETHODS.register(["vllm-w8a8", "w8a8"])
 class w8a8QuantizationMethod(BaseQuantizationMethod):
@@ -63,27 +69,27 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    def quantize(self, weight: torch.Tensor):
-        if isinstance(weight, tuple):
-            return (weight[0].transpose(0, 1).cuda(self.device_id_),) + weight[1:]
-        weight = weight.float()
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        weight = weight.float().cuda(self.device_id_)
         scale = weight.abs().max(dim=-1)[0] / 127
-        weight = weight.transpose(0, 1) / scale.reshape(1, -1)
+        weight = weight / scale.reshape(-1, 1)
         weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
-        return weight.cuda(self.device_id_), scale.cuda(self.device_id_), None
+        output.weight[offset : offset + weight.shape[0]].copy_(weight)
+        output.weight_scale[offset : offset + weight.shape[0]].copy_(scale)
+        return
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         input_scale = None
-        qweight = weight_pack.weight
+        qweight = weight_pack.weight.t()
         weight_scale = weight_pack.weight_scale
-        bias = weight_pack.bias
         input_scale = None  # dynamic quantization for input tensor
         x_q, x_scale, x_zp = vllm_ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
         m = input_tensor.shape[0]
@@ -100,6 +106,14 @@ def apply(
     def method_name(self):
         return "vllm-w8a8"
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
 
 @QUANTMETHODS.register(["vllm-fp8w8a8", "fp8w8a8"])
 class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
@@ -109,19 +123,20 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    def quantize(self, weight: torch.Tensor):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
         if self.is_moe:
-            return self.quantize_moe(weight)
+            return self.quantize_moe(weight, output, offset)
         qweight, weight_scale = scaled_fp8_quant(
-            weight.contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
+            weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
-        return qweight.transpose(0, 1), weight_scale, None
+        output.weight[offset : offset + qweight.shape[0], :].copy_(qweight)
+        output.weight_scale[offset : offset + weight_scale.shape[0]].copy_(weight_scale.view(-1))
+        return
 
-    def quantize_moe(self, weight: torch.Tensor):
+    def quantize_moe(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
         num_experts = weight.shape[0]
-        qweights = []
-        weight_scales = []
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
+        weight_scales = []
         for i in range(num_experts):
             qweight, weight_scale = scaled_fp8_quant(
                 weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
@@ -129,19 +144,26 @@ def quantize_moe(self, weight: torch.Tensor):
             qweights[i] = qweight
             weight_scales.append(weight_scale)
         weight_scale = torch.stack(weight_scales, dim=0).contiguous()
-        return qweights, weight_scale, None
+        # quantize() forwards (weight, output, offset), so fill `output` in place like the
+        # non-MoE path instead of returning a new pack.
+        # NOTE(review): assumes output tensors are laid out (num_experts, out_dim, ...) as
+        # create_weight allocates them, with `offset` indexing the out_dim axis — confirm.
+        out_rows = qweights.shape[1]
+        output.weight[:, offset : offset + out_rows].copy_(qweights)
+        output.weight_scale[:, offset : offset + out_rows].copy_(weight_scale.view(num_experts, -1))
+        return
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        qweight = weight_pack.weight
+        qweight = weight_pack.weight.t()
         weight_scale = weight_pack.weight_scale
-        bias = weight_pack.bias
         x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
         m = input_tensor.shape[0]
         n = qweight.shape[1]
@@ -160,6 +175,14 @@ def apply(
     def method_name(self):
         return "vllm-fp8w8a8"
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
 
 @QUANTMETHODS.register(["vllm-fp8w8a8-b128", "fp8w8a8-b128"])
 class FP8w8a8B128QuantizationMethod(BaseQuantizationMethod):
@@ -170,21 +193,32 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    def quantize(self, weight: torch.Tensor):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        """Block-quantize `weight` to fp8 and copy it into `output` starting at weight row `offset`."""
+        from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
 
-        raise Exception("Not implemented")
+        device = output.weight.device
+        weight, scale = weight_quant(weight.cuda(device), self.block_size)
+        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
+        # `offset` counts weight rows; `scale` is assumed to have one row per block of rows, so
+        # only the start is divided by block_size and the end follows from scale's own row count.
+        # (`offset + rows // block_size` parses as offset + (rows // block_size), which is
+        # wrong for any offset > 0.)
+        scale_start = offset // self.block_size
+        output.weight_scale[scale_start : scale_start + scale.shape[0]].copy_(scale)
+        return
 
     def apply(
         self,
         input_tensor: torch.Tensor,
-        weight_pack: "MMWeightPack",
+        weight_pack: WeightPack,
         out: Optional[torch.Tensor] = None,
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        qweight = weight_pack.weight
-        weight_scale = weight_pack.weight_scale
-        bias = weight_pack.bias
+        qweight = weight_pack.weight.t()
+        weight_scale = weight_pack.weight_scale.t()
         input_scale = None  # dynamic quantization for input tensor
         m, k = input_tensor.shape
         n = qweight.shape[1]
@@ -213,3 +241,13 @@ def apply(
     @property
     def method_name(self):
         return "vllm-fp8w8a8-b128"
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        weight_scale = torch.empty(
+            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
+        ).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
diff --git a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
index c5c14b08e..5fb85aa1c 100644
--- a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
@@ -2,7 +2,9 @@
 import torch
 import numpy as np
 
-from lightllm.common.basemodel.layer_weights.meta_weights.gpt_oss_fused_moe_weight_tp import GPTOSSFusedMoeWeightTP
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.gpt_oss_fused_moe_weight_tp import (
+    GPTOSSFusedMoeWeightTP,
+)
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight import ROWMMWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import TpAttSinkWeight
 from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
diff --git a/lightllm/models/llama/layer_weights/transformer_layer_weight.py b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
index 197116d99..a455a01f9 100644
--- a/lightllm/models/llama/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
@@ -23,11 +23,16 @@ def _init_weight(self):
         self._init_norm()
 
     def _parse_config(self):
+        self.tp_q_head_num_ = self.network_config_["num_attention_heads"] // self.tp_world_size_
+        self.tp_k_head_num_ = max(self.network_config_["num_key_value_heads"] // self.tp_world_size_, 1)
+        self.tp_v_head_num_ = self.tp_k_head_num_
+        self.tp_o_head_num_ = self.tp_q_head_num_
+        head_dim = self.network_config_["hidden_size"] // self.network_config_["num_attention_heads"]
+        self.head_dim = self.network_config_.get("head_dim", head_dim)
+        assert (self.tp_k_head_num_ * self.tp_world_size_) % self.network_config_["num_key_value_heads"] == 0
         self.n_embed = self.network_config_["hidden_size"]
-        self.n_head = self.network_config_["num_attention_heads"]
         self.n_inter = self.network_config_["intermediate_size"]
-        self.n_kv_head = self.network_config_["num_key_value_heads"]
-        self.head_dim = self.network_config_.get("head_dim", self.n_embed // self.n_head)
+        self.n_head = self.network_config_["num_attention_heads"]
 
     def _init_weight_names(self):
         self._q_weight_name = f"model.layers.{self.layer_num_}.self_attn.q_proj.weight"
@@ -56,49 +61,57 @@ def _init_weight_names(self):
         self._ffn_norm_bias_name = None
 
     def _init_qkv(self):
+        in_dim = self.n_embed
+        q_out_dim = self.tp_q_head_num_ * self.head_dim
+        k_out_dim = self.tp_k_head_num_ * self.head_dim
+        v_out_dim = self.tp_v_head_num_ * self.head_dim
         self.q_proj = ROWMMWeight(
+            in_dim=in_dim,
+            out_dims=[q_out_dim],
             weight_names=self._q_weight_name,
             data_type=self.data_type_,
             bias_names=self._q_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="q_proj",
+            quant_method=self.get_quant_method("q_proj"),
         )
         self.kv_proj = ROWMMWeight(
+            in_dim=in_dim,
+            out_dims=[k_out_dim, v_out_dim],
             weight_names=[self._k_weight_name, self._v_weight_name],
             data_type=self.data_type_,
             bias_names=[self._k_bias_name, self._v_bias_name],
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="kv_proj",
+            quant_method=self.get_quant_method("kv_proj"),
         )
 
     def _init_o(self):
+        in_dim = self.tp_o_head_num_ * self.head_dim
+        out_dim = self.n_embed
         self.o_proj = COLMMWeight(
+            in_dim=in_dim,
+            out_dims=[out_dim],
             weight_names=self._o_weight_name,
             data_type=self.data_type_,
             bias_names=self._o_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="o_proj",
+            quant_method=self.get_quant_method("o_proj"),
         )
 
     def _init_ffn(self):
+        in_dim = self.n_embed
+        out_dim = self.n_inter // self.tp_world_size_
         self.gate_up_proj = ROWMMWeight(
+            in_dim=in_dim,
+            out_dims=[out_dim, out_dim],
             weight_names=[self._gate_weight_name, self._up_weight_name],
             data_type=self.data_type_,
             bias_names=[self._gate_bias_name, self._up_bias_name],
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="gate_up_proj",
+            quant_method=self.get_quant_method("gate_up_proj"),
         )
         self.down_proj = COLMMWeight(
+            in_dim=out_dim,
+            out_dims=[in_dim],
             weight_names=self._down_weight_name,
             data_type=self.data_type_,
             bias_names=self._down_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="down_proj",
+            quant_method=self.get_quant_method("down_proj"),
         )
 
     def _init_norm(self):
diff --git a/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py b/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
index 39e28d465..cc125f926 100644
--- a/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
@@ -61,6 +61,7 @@ def _init_moe(self):
                 layer_num=self.layer_num_,
                 quant_cfg=self.quant_cfg,
                 num_fused_shared_experts=0,
+                hidden_size=self.network_config_.get("hidden_size"),
             )
         else:
             raise ValueError(f"Unsupported moe mode: {moe_mode}")
diff --git a/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py
index 9c3e2cb3a..74cf6c600 100644
--- a/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py
@@ -11,15 +11,6 @@ def _init_weight_names(self):
         self._k_bias_name = f"model.layers.{self.layer_num_}.self_attn.k_proj.bias"
         self._v_bias_name = f"model.layers.{self.layer_num_}.self_attn.v_proj.bias"
 
-    def _parse_config(self):
-        self.tp_q_head_num_ = self.network_config_["num_attention_heads"] // self.tp_world_size_
-        self.tp_k_head_num_ = max(self.network_config_["num_key_value_heads"] // self.tp_world_size_, 1)
-        self.tp_v_head_num_ = self.tp_k_head_num_
-        self.tp_o_head_num_ = self.tp_q_head_num_
-        head_dim = self.network_config_["hidden_size"] // self.network_config_["num_attention_heads"]
-        self.head_dim = self.network_config_.get("head_dim", head_dim)
-        assert (self.tp_k_head_num_ * self.tp_world_size_) % self.network_config_["num_key_value_heads"] == 0
-
     def _repeat_weight(self, name, weights):
         # for tp_world_size_ > num_key_value_heads
         if name not in weights:
diff --git a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
index 486f4d696..17023d0cb 100644
--- a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
@@ -53,10 +53,11 @@ def _init_weight(self):
     def _init_moe(self):
         moe_intermediate_size = self.network_config_["moe_intermediate_size"]
         self.moe_gate = ROWMMWeight(
+            in_dim=self.network_config_["hidden_size"],
+            out_dims=[self.n_routed_experts],
             weight_names=f"model.layers.{self.layer_num_}.mlp.gate.weight",
             data_type=self.data_type_,
-            layer_num=self.layer_num_,
-            name="moe_gate",
+            quant_method=None,
             tp_rank=0,
             tp_world_size=1,
         )
diff --git a/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
index 329a0245f..34e74f136 100644
--- a/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
@@ -11,6 +11,7 @@ class StarcoderPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
 
+    def _create_weight(self):
         self.wte_weight_ = EmbeddingWeight(
             weight_name="transformer.wte.weight",
             data_type=self.data_type_,
diff --git a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
index 6ee188537..d08c27cc7 100644
--- a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
@@ -1,3 +1,4 @@
+import torch
 import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, NoTpNormWeight
diff --git a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
index e2bed1036..79dc9d95c 100644
--- a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
@@ -13,6 +13,42 @@ def __init__(self, data_type, network_config):
         self.image_size = self.network_config_["image_size"]
         self.patch_size = self.network_config_["patch_size"]
         self.llm_hidden_size = self.network_config_["llm_hidden_size"]
+        self._create_weight()
+        return
+
+    def _create_weight(self):
+        """Pre-allocate this rank's GPU buffers; load_hf_weights later fills them with copy_()."""
+        split_indexes = np.linspace(0, self.embed_dim, self.tp_world_size_ + 1, dtype=np.int64)
+        split_start = split_indexes[self.tp_rank_]
+        split_end = split_indexes[self.tp_rank_ + 1]
+        split_embed_dim = split_end - split_start
+
+        # Pre-allocate memory for vision model weights
+        self.class_embedding = torch.empty((1, 1, split_embed_dim), dtype=self.data_type_).cuda()
+        # One position per image patch plus the class token, derived from the config instead of
+        # the hard-coded 197 (= (224 // 16) ** 2 + 1), which only fits 224px / 16px-patch models.
+        num_positions = (self.image_size // self.patch_size) ** 2 + 1
+        self.position_embedding = torch.empty((1, num_positions, split_embed_dim), dtype=self.data_type_).cuda()
+        self.patch_embedding_weight_ = torch.empty(
+            (split_embed_dim, 3, self.patch_size, self.patch_size), dtype=self.data_type_
+        ).cuda()
+        self.patch_embedding_bias_ = torch.empty(split_embed_dim, dtype=self.data_type_).cuda()
+
+        # Pre-allocate memory for adapter weights
+        # NOTE(review): these buffers are sized from embed_dim / llm_hidden_size only; confirm
+        # they match the checkpoint's mlp1.* tensor shapes, since copy_() needs matching shapes.
+        self.layernorm_weight_ = torch.empty(self.embed_dim, dtype=self.data_type_).cuda()
+        self.layernorm_bias_ = torch.empty(self.embed_dim, dtype=self.data_type_).cuda()
+
+        split_indexes_llm = np.linspace(0, self.llm_hidden_size, self.tp_world_size_ + 1, dtype=np.int64)
+        split_start_llm = split_indexes_llm[self.tp_rank_]
+        split_end_llm = split_indexes_llm[self.tp_rank_ + 1]
+        split_llm_hidden_size = split_end_llm - split_start_llm
+
+        self.mlp1_1_weight_ = torch.empty((self.llm_hidden_size, split_llm_hidden_size), dtype=self.data_type_).cuda()
+        self.mlp1_1_bias_ = torch.empty(split_llm_hidden_size, dtype=self.data_type_).cuda()
+        self.mlp1_3_weight_ = torch.empty((split_llm_hidden_size, self.llm_hidden_size), dtype=self.data_type_).cuda()
+        self.mlp1_3_bias_ = torch.empty(self.llm_hidden_size, dtype=self.data_type_).cuda()
         return
 
     def _cuda(self, cpu_tensor):
@@ -40,40 +72,38 @@ def load_hf_weights(self, weights):
         split_start = split_indexes[self.tp_rank_]
         split_end = split_indexes[self.tp_rank_ + 1]
         if "vision_model.embeddings.class_embedding" in weights:
-            self.class_embedding = self._cuda(
-                weights["vision_model.embeddings.class_embedding"][:, :, split_start:split_end]
-            )
+            self.class_embedding.copy_(weights["vision_model.embeddings.class_embedding"][:, :, split_start:split_end])
         if "vision_model.embeddings.position_embedding" in weights:
-            self.position_embedding = self._cuda(
+            self.position_embedding.copy_(
                 weights["vision_model.embeddings.position_embedding"][:, :, split_start:split_end]
             )
         if "vision_model.embeddings.patch_embedding.weight" in weights:
-            self.patch_embedding_weight_ = self._cuda(
+            self.patch_embedding_weight_.copy_(
                 weights["vision_model.embeddings.patch_embedding.weight"][split_start:split_end, :, :, :]
             )
         if "vision_model.embeddings.patch_embedding.bias" in weights:
-            self.patch_embedding_bias_ = self._cuda(
+            self.patch_embedding_bias_.copy_(
                 weights["vision_model.embeddings.patch_embedding.bias"][split_start:split_end]
             )
 
         if "mlp1.0.weight" in weights:
-            self.layernorm_weight_ = self._cuda(weights["mlp1.0.weight"])
+            self.layernorm_weight_.copy_(weights["mlp1.0.weight"])
         if "mlp1.0.bias" in weights:
-            self.layernorm_bias_ = self._cuda(weights["mlp1.0.bias"])
+            self.layernorm_bias_.copy_(weights["mlp1.0.bias"])
 
         split_indexes = np.linspace(0, self.llm_hidden_size, self.tp_world_size_ + 1, dtype=np.int64)
         split_start = split_indexes[self.tp_rank_]
         split_end = split_indexes[self.tp_rank_ + 1]
 
         if "mlp1.1.weight" in weights:
-            self.mlp1_1_weight_ = self._cuda(weights["mlp1.1.weight"][split_start:split_end, :]).t()
+            self.mlp1_1_weight_.copy_(weights["mlp1.1.weight"][split_start:split_end, :].t())
         if "mlp1.1.bias" in weights:
-            self.mlp1_1_bias_ = self._cuda(weights["mlp1.1.bias"][split_start:split_end])
+            self.mlp1_1_bias_.copy_(weights["mlp1.1.bias"][split_start:split_end])
 
         if "mlp1.3.weight" in weights:
-            self.mlp1_3_weight_ = self._cuda(weights["mlp1.3.weight"][:, split_start:split_end]).t()
+            self.mlp1_3_weight_.copy_(weights["mlp1.3.weight"][:, split_start:split_end].t())
         if "mlp1.3.bias" in weights:
-            self.mlp1_3_bias_ = self._cuda(weights["mlp1.3.bias"])
+            self.mlp1_3_bias_.copy_(weights["mlp1.3.bias"])
 
         return
 
diff --git a/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py b/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py
index 811d39a72..e3a71379d 100644
--- a/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py
+++ b/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py
@@ -8,10 +8,10 @@
 import json
 from typing import List
 from lightllm.common.basemodel.basemodel import TpPartBaseModel
-from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe_weight_ep_redundancy import (
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight_ep_redundancy import (
     FusedMoeWeightEPAutoRedundancy,
 )
-from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe_weight_ep import FusedMoeWeightEP
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight_ep import FusedMoeWeightEP
 from lightllm.utils.envs_utils import get_env_start_args, get_redundancy_expert_update_interval
 from lightllm.utils.envs_utils import get_redundancy_expert_update_max_load_count
 from lightllm.utils.envs_utils import get_redundancy_expert_num

From bc0301e159edf8de3e66d960b1a9b764dde759b7 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Sun, 11 Jan 2026 09:25:21 +0000
Subject: [PATCH 02/65] refactor norm and add platform

---
 lightllm/common/basemodel/basemodel.py        |  21 ++-
 .../layer_weights/meta_weights/__init__.py    |   2 +-
 .../layer_weights/meta_weights/base_weight.py |  33 +---
 .../layer_weights/meta_weights/norm_weight.py | 152 +++++++++++++-----
 .../layer_weights/meta_weights/platform_op.py |  52 ++++++
 lightllm/server/api_cli.py                    |  12 ++
 lightllm/utils/device_utils.py                |  41 +++++
 7 files changed, 233 insertions(+), 80 deletions(-)
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py

diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
index 25171df2a..435f39a88 100755
--- a/lightllm/common/basemodel/basemodel.py
+++ b/lightllm/common/basemodel/basemodel.py
@@ -103,20 +103,15 @@ def __init__(self, kvargs):
         self._verify_params()
         self._init_quant()
 
-        # 更连续的显存分配可以有更好的性能
-        if self.max_total_token_num is None:
-            self._init_weights()
-            self._init_mem_manager()
-        else:
-            self._init_mem_manager()
-            self._init_weights()
-
+        self._init_weights()
+        self._init_mem_manager()
         self._init_kv_move_buffer()
         self._check_mem_size()
         self._init_req_manager()
         self._init_infer_layer()
         self._init_some_value()
         self._init_custom()
+        self._load_hf_weights()
         # wait必须在init cudagraph 之前，避免错误捕获
         self._wait_other_modules_ready()
 
@@ -181,6 +176,16 @@ def _init_weights(self, start_layer_index=0):
         ]
         return
 
+    def _load_hf_weights(self):
+        load_hf_weights(
+            self.data_type,
+            weight_dir=self.weight_dir_,
+            pre_post_layer=self.pre_post_weight,
+            transformer_layer_list=self.trans_layers_weight,
+            weight_dict=self.weight_dict,
+        )
+        return
+
     def _init_mem_manager(self):
         assert self.config["num_attention_heads"] % self.tp_world_size_ == 0
         self.mem_manager: MemoryManager = select_mem_manager_class()(
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index 72e0034cb..109777401 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -5,7 +5,7 @@
     COLMMWeight,
     ROWBMMWeight,
 )
-from .norm_weight import NoTpGEMMANormWeight, TpVitPadNormWeight, NoTpNormWeight, TpHeadNormWeight
+from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight
 from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
 from .att_sink_weight import TpAttSinkWeight
 from .fused_moe.fused_moe_weight_tp import create_tp_moe_wegiht_obj
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
index 2cd8ea6ae..b17da6682 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
@@ -13,7 +13,7 @@ def load_hf_weights(self, weights):
         pass
 
     @abstractmethod
-    def verify_load(self) -> bool:
+    def _create_weight(self):
         pass
 
 
@@ -27,32 +27,5 @@ def __init__(self, tp_rank: int = None, tp_world_size: int = None, data_type: to
     def load_hf_weights(self, weights):
         raise NotImplementedError("load_hf_weights must implement this method")
 
-    def verify_load(self) -> bool:
-        raise NotImplementedError("verify_load must implement this method")
-
-    def _get_head_tp_split_params(self, weight: torch.Tensor) -> Tuple[int, int]:
-        """
-        Docstring for _get_head_tp_split_params,
-        一个常用的tp 划分head获取head_index 范围的功能函数, 一些继承类可能会使用。
-        :param self: Description
-        :param weight: Description
-        :type weight: torch.Tensor
-        :return: Description
-        :rtype: Tuple[int, int]
-        """
-        assert weight.ndim == 2
-
-        all_head_num = weight.shape[0]
-        tp_head_num = all_head_num // self.tp_world_size_
-
-        if tp_head_num > 0:
-            start_head_index = self.tp_rank_ * tp_head_num
-            end_head_index = (self.tp_rank_ + 1) * tp_head_num
-        else:
-            # 当 tp_world_size 大于 all_head_num 时的特殊处理
-            scale_size = self.tp_world_size_ // all_head_num
-            assert self.tp_world_size_ % all_head_num == 0
-            start_head_index = self.tp_rank_ // scale_size
-            end_head_index = start_head_index + 1
-
-        return start_head_index, end_head_index
+    def _create_weight(self) -> bool:
+        raise NotImplementedError("create_weight must implement this method")
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 619158fa8..de13818a5 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -1,77 +1,147 @@
 import torch
-from typing import Optional
+from typing import Optional, Dict
 from .base_weight import BaseWeightTpl
-from lightllm.utils.dist_utils import get_current_device_id
+from lightllm.utils.dist_utils import get_current_device_id, get_current_rank_in_dp, get_dp_world_size
 from lightllm.common.basemodel.triton_kernel.rmsnorm import rmsnorm_forward
 from lightllm.common.basemodel.triton_kernel.layernorm import layernorm_forward
 from lightllm.utils.log_utils import init_logger
+from .platform_op import PlatformAwareOp
 
 logger = init_logger(__name__)
 
 
-class NormWeight(BaseWeightTpl):
-    def __init__(self, norm_dim: int, weight_name, data_type, bias_name=None):
+class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
         super().__init__()
-        self.norm_dim = norm_dim
+        self.dim = dim
+        self.weight_name = weight_name
+        self.data_type_ = data_type
+        self._create_weight()
+
+    def _create_weight(self):
+        self.weight: torch.Tensor = torch.nn.Parameter(
+            torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
+        )
+
+    def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
+        if self.weight_name in weights:
+            self.weight.copy_(weights[self.weight_name])
+
+    def _native_forward(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        """Pure-PyTorch RMSNorm fallback: normalize in fp32, scale by weight; fill `out` when one is given."""
+        assert input.ndim == 2 and self.weight.ndim == 1
+        assert input.shape[-1] == self.dim, f"Expected hidden_size to be {self.dim}, but found: {input.shape[-1]}"
+        x = input.to(torch.float32)
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + eps)
+        x = (x * self.weight).to(self.data_type_)
+        if out is not None:  # was `is None`, which called copy_ on None and ignored a supplied buffer
+            out.copy_(x)
+            return out
+        return x
+
+    def _cuda_forward(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        assert input.ndim == 2 and self.weight.ndim == 1
+        if out is None:
+            out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
+        return rmsnorm_forward(x=input, weight=self.weight, eps=eps, out=out)
+
+    def apply(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        return self._forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
+
+
+class LayerNormWeight(BaseWeightTpl, PlatformAwareOp):
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
+        super().__init__()
+        self.dim = dim
         self.weight_name = weight_name
         self.bias_name = bias_name
         self.data_type_ = data_type
-        self.weight = None
-        self.bias = None
-        self.is_weight_ready = False
-        self.is_bias_ready = False
         self._create_weight()
 
     def _create_weight(self):
-        device = f"cuda:{get_current_device_id()}"
-        self.weight = torch.empty(self.norm_dim, dtype=self.data_type_, device=device)
-        self.bias = (
-            torch.empty(self.norm_dim, dtype=self.data_type_, device=device) if self.bias_name is not None else None
+        self.weight: torch.Tensor = torch.nn.Parameter(
+            torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
+        )
+        self.bias: torch.Tensor = torch.nn.Parameter(
+            torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
         )
 
-    def load_hf_weights(self, weights):
+    def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
-            self.is_weight_ready = True
         if self.bias_name in weights:
             self.bias.copy_(weights[self.bias_name])
-            self.is_bias_ready = True
 
-    def verify_load(self):
-        return self.is_weight_ready and (self.bias_name is None or self.is_bias_ready)
+    def _native_forward(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        assert input.ndim == 2 and self.weight.ndim == 1
+        assert input.shape[-1] == self.dim, f"Expected hidden_size to be {self.dim}, but found: {input.shape[-1]}"
+        x = torch.nn.functional.layer_norm(  # pure-PyTorch fallback used when no platform kernel is routed
+            input, normalized_shape=[self.dim], weight=self.weight, bias=self.bias, eps=eps
+        )
+        if out is not None:  # was `is None`, which called copy_ on None and ignored a supplied buffer
+            out.copy_(x.to(self.data_type_))
+            return out
+        return x.to(self.data_type_)
 
-    def rmsnorm_forward(
+    def _cuda_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
-        assert input.ndim in [2, 3] and self.weight.ndim == 1
-        assert self.bias is None
+        assert input.ndim == 2 and self.weight.ndim == 1
         if out is None:
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
-        return rmsnorm_forward(x=input, weight=self.weight, eps=eps, out=out)
+        return layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps, out=out)
 
+    def apply(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        return self._forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
 
-class GEMMANormWeight(NormWeight):
-    def __init__(self, norm_dim: int, weight_name, data_type, bias_name=None):
-        super().__init__(norm_dim, weight_name, data_type, bias_name)
 
-    def load_hf_weights(self, weights):
-        # TODO: 这里直接 +1 会不会导致精度问题? 计算时要求 (1.0 + weight.float()) ?
-        if self.weight_name in weights:
-            self.weight.copy_((weights[self.weight_name] + 1).to(self.data_type_))
-            self.is_weight_ready = True
+class TpRMSNormWeight(RMSNormWeight):
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
+        self.tp_world_size_ = get_dp_world_size()
+        self.tp_rank_ = get_current_rank_in_dp()
+        self.repeat_times_ = 1  # default; _get_tp_padded_dim raises it when tp_world_size_ > dim
+        tp_dim = self._get_tp_padded_dim(dim=dim)  # must be known before super().__init__ allocates self.weight
+        super().__init__(dim=tp_dim, weight_name=weight_name, data_type=data_type, bias_name=bias_name)
 
+    def _get_tp_padded_dim(self, dim: int):
+        """
+        Get the padded dimension for the weight.
+        1. if dim is divisible by tp_world_size_, return dim
+        2. if dim is greater than tp_world_size_, return (dim + tp_world_size_ - 1) // tp_world_size_ * tp_world_size_
+        3. if dim is less than tp_world_size_, assert tp_world_size_ is divisible by dim, and return dim
+        """
+        if dim % self.tp_world_size_ == 0:
+            return dim // self.tp_world_size_
 
-class TpNormWeight(NormWeight):
-    def __init__(self, norm_dim: int, weight_name, data_type, bias_name=None):
-        super().__init__(norm_dim, weight_name, data_type, bias_name)
+        if dim > self.tp_world_size_:
+            return (dim + self.tp_world_size_ - 1) // self.tp_world_size_ * self.tp_world_size_
+        else:
+            assert (
+                self.tp_world_size_ % dim == 0
+            ), f"tp_world_size_ must be divisible by dim, but found: {self.tp_world_size_} % {dim}"
+            self.repeat_times_ = self.tp_world_size_ // dim
+            return dim * self.repeat_times_ // self.tp_world_size_
 
     def load_hf_weights(self, weights):
-        start = self.norm_dim * self.tp_rank_
-        end = self.norm_dim * (self.tp_rank_ + 1)
+        if self.weight_name in weights:  # self.weight is pre-allocated, so a None check would skip every load
+            t_weight = weights[self.weight_name]
+            hidden_size = t_weight.shape[0]
+            split_hidden_size = hidden_size // self.tp_world_size_
 
-        if self.weight_name in weights:
-            self.weight.copy_(weights[self.weight_name][start:end].to(self.data_type_))
-            self.is_weight_ready = True
-        if self.bias_name in weights:
-            self.bias.copy_(weights[self.bias_name][start:end].to(self.data_type_))
-            self.is_bias_ready = True
+            start = split_hidden_size * self.tp_rank_ // self.repeat_times_
+            end = min(split_hidden_size * (self.tp_rank_ + 1) // self.repeat_times_, hidden_size)
+
+            self.weight[: end - start].copy_(t_weight[start:end].to(self.data_type_))  # weight is 1-D
+            # the padding part is zero (local tail after the copied slice, not the global `end` offset)
+            self.weight[end - start :].zero_()
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py b/lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py
new file mode 100644
index 000000000..127a543b2
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py
@@ -0,0 +1,52 @@
+import torch
+from abc import ABC, abstractmethod
+from typing import Optional, Callable, Any
+from lightllm.utils.device_utils import get_platform, Platform
+from lightllm.utils.envs_utils import get_env_start_args
+
+
+class PlatformAwareOp(ABC):
+    """
+    platform aware op base class,
+    automatically route to the corresponding implementation method according to the platform.
+    """
+
+    def __init__(self):
+        args = get_env_start_args()
+        self.platform = get_platform(args.hardware_platform)
+        self.enable_torch_naive = args.enable_torch_naive
+        self._forward = self._route_forward()
+
+    def _route_forward(self) -> Callable:  # picks the forward implementation once, at construction time
+        method_name_map = {
+            Platform.CUDA: "_cuda_forward",
+            Platform.ASCEND: "_ascend_forward",
+            Platform.CAMBRICON: "_cambricon_forward",
+            Platform.MUSA: "_musa_forward",
+            Platform.ROCM: "_rocm_forward",
+            Platform.CPU: "_cpu_forward",
+        }
+
+        method_name = method_name_map.get(self.platform)
+        if method_name and hasattr(self, method_name):  # a platform-specific hook wins when the subclass has one
+            method = getattr(self, method_name)
+            if callable(method):
+                return method
+
+        if self.enable_torch_naive:  # otherwise fall back to the pure-torch path, but only when opted in
+            return self._native_forward
+
+        # Neither a platform hook nor the torch fallback is available: fail loudly.
+        raise NotImplementedError(
+            f"No implementation found for platform {self.platform.name}. "
+            f"Please implement _{self.platform.value}_forward method, "
+            "or set --enable_torch_naive to use default implementation."
+        )
+
+    @abstractmethod
+    def _native_forward(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("default forward must implement this method")
+
+    @abstractmethod
+    def _cuda_forward(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("cuda forward must implement this method")
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index 44cc38822..ac883b1a4 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -608,4 +608,16 @@ def make_argument_parser() -> argparse.ArgumentParser:
         default=False,
         help="""Enable prefix prompt cache fetch for data parallel inference, disabled by default.""",
     )
+    parser.add_argument(
+        "--hardware_platform",
+        type=str,
+        default="cuda",
+        choices=["cuda", "musa"],
+        help="""Hardware platform: cuda | musa""",
+    )
+    parser.add_argument(
+        "--enable_torch_naive",
+        action="store_true",
+        help="""Use torch naive implementation for the op.""",
+    )
     return parser
diff --git a/lightllm/utils/device_utils.py b/lightllm/utils/device_utils.py
index 09d7a680f..a1ed6ed95 100644
--- a/lightllm/utils/device_utils.py
+++ b/lightllm/utils/device_utils.py
@@ -3,6 +3,8 @@
 import torch
 import shutil
 import subprocess
+from enum import Enum
+from typing import Optional
 from functools import lru_cache
 from lightllm.utils.log_utils import init_logger
 
@@ -284,3 +286,42 @@ def is_5090_gpu() -> bool:
             return False
     except:
         return False
+
+
+class Platform(Enum):
+    """hardware platform enum"""
+
+    CUDA = "cuda"
+    ASCEND = "ascend"  # ascend
+    CAMBRICON = "cambricon"  # cambricon
+    MUSA = "musa"  # musa
+    ROCM = "rocm"  # rocm
+    CPU = "cpu"  # cpu
+
+
+# Only cuda and musa are supported for now.
+def get_platform(platform_name: Optional[str] = None) -> Platform:
+    """
+    get hardware platform.
+
+    Args:
+        platform_name: case-insensitive platform name; None falls back to "cuda" (the --hardware_platform default)
+
+    Returns:
+        Platform: platform enum value
+    """
+    platform_name = "cuda" if platform_name is None else platform_name.lower()  # normalize before validating
+    assert platform_name in ["cuda", "musa"], f"Only support cuda and musa now, but got {platform_name}"
+    platform_map = {
+        "cuda": Platform.CUDA,
+        "ascend": Platform.ASCEND,
+        "cambricon": Platform.CAMBRICON,
+        "musa": Platform.MUSA,
+        "rocm": Platform.ROCM,
+        "cpu": Platform.CPU,
+    }
+
+    platform = platform_map.get(platform_name)
+    if platform is None:
+        raise ValueError(f"Unknown platform name: {platform_name}")
+    return platform

From 2cd361ab6e6b88084f1970872b82997fb4e5de2a Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 07:19:33 +0000
Subject: [PATCH 03/65] norm

---
 .../meta_weights/embedding_weight.py          | 31 +++++++++-------
 .../layer_weights/meta_weights/norm_weight.py |  1 +
 .../pre_and_post_layer_weight.py              |  6 ++--
 .../layer_weights/transformer_layer_weight.py | 28 ++++++++++-----
 .../pre_and_post_layer_weight.py              | 12 ++++---
 .../pre_and_post_layer_weight.py              |  7 ++--
 .../pre_and_post_layer_weight.py              |  6 ++--
 .../pre_and_post_layer_weight.py              |  7 ++--
 .../layer_weights/transformer_layer_weight.py | 17 ++++++---
 .../pre_and_post_layer_weight.py              | 13 +++----
 .../layer_weights/transformer_layer_weight.py | 10 ++++--
 .../pre_and_post_layer_weight.py              |  6 ++--
 .../layer_weights/transformer_layer_weight.py | 16 ++++++---
 .../pre_and_post_layer_weight.py              | 11 +++---
 .../layer_weights/transformer_layer_weight.py | 10 ++++--
 .../pre_and_post_layer_weight.py              |  6 ++--
 .../pre_and_post_layer_weight.py              |  7 ++--
 .../pre_and_post_layer_weight.py              |  6 ++--
 .../pre_and_post_layer_weight.py              |  7 ++--
 .../layer_weights/transformer_layer_weight.py | 36 ++++++++++++++-----
 20 files changed, 163 insertions(+), 80 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index fc018267f..e9b9176dd 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -4,38 +4,45 @@
 from .base_weight import BaseWeightTpl
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.common.basemodel.triton_kernel.embedding import embedding as embedding_kernel
+from lightllm.utils.dist_utils import get_dp_world_size, get_current_rank_in_dp
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
 
 
 class EmbeddingWeight(BaseWeightTpl):
-    def __init__(self, weight_name, data_type):
+    def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch.dtype):
         super().__init__()
+        self.dim = dim
+        self.vocab_size = vocab_size
+        self.tp_world_size_ = get_dp_world_size()
+        self.tp_rank_ = get_current_rank_in_dp()
+        # compute this rank's [start, end) slice of the vocab rows
+        split_indexes = np.linspace(0, self.vocab_size, self.tp_world_size_ + 1, dtype=np.int64)
+        self.tp_vocab_start_id = int(split_indexes[self.tp_rank_])
+        self.tp_vocab_end_id = int(split_indexes[self.tp_rank_ + 1])
         self.weight_name: str = weight_name
         self.data_type_ = data_type
-        self.weight: torch.Tensor = None
+        self._create_weight()  # allocate the shard up front so load_hf_weights can copy_ into it
 
+    def _create_weight(self):
+        tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
+        self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
+
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
-        if self.weight_name not in weights or self.weight is not None:
+        if self.weight_name not in weights:  # pre-allocated weight: a None check would skip every load
             return
-
         t_weight = weights[self.weight_name]
         # init some params
-        self.vocab_size = len(t_weight)
-        split_indexes = np.linspace(0, self.vocab_size, self.tp_world_size_ + 1, dtype=np.int64)
-        self.tp_vocab_start_id = int(split_indexes[self.tp_rank_])
-        self.tp_vocab_end_id = int(split_indexes[self.tp_rank_ + 1])
-
+        loaded_vocab_size = len(t_weight)
+        assert (
+            loaded_vocab_size == self.vocab_size
+        ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
         logger.info(f"loaded weight vocab_size: {self.vocab_size}")
-
-        self.weight = (
+        self.weight.copy_(
             t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_).cuda(get_current_device_id())
         )
 
-    def verify_load(self):
-        return self.weight is not None
-
     def embedding(self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty):
         if out is None:
             out = alloc_func(
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index de13818a5..7b966600c 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -16,6 +16,7 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
         self.dim = dim
         self.weight_name = weight_name
         self.data_type_ = data_type
+        assert bias_name is None, "RMSNormWeight does not have bias"
         self._create_weight()
 
     def _create_weight(self):
diff --git a/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
index 83f767453..e02af4b4e 100644
--- a/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
@@ -1,18 +1,18 @@
 import torch
 import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LayerNormWeight
 
 
 class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
-        self.pre_norm_weight_ = NoTpNormWeight(
+        self.pre_norm_weight_ = LayerNormWeight(
             weight_name="word_embeddings_layernorm.weight",
             data_type=self.data_type_,
             bias_name="word_embeddings_layernorm.bias",
         )
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = LayerNormWeight(
             weight_name="ln_f.weight",
             data_type=self.data_type_,
             bias_name="ln_f.bias",
diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
index c5a2d3352..65e00ebe7 100644
--- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
@@ -7,7 +7,7 @@
 from lightllm.common.basemodel.layer_weights.meta_weights import (
     ROWMMWeight,
     COLMMWeight,
-    NoTpNormWeight,
+    RMSNormWeight,
     FusedMoeWeightEP,
     ROWBMMWeight,
     create_tp_moe_wegiht_obj,
@@ -299,16 +299,26 @@ def _init_ffn(self):
         self._load_mlp(f"model.layers.{self.layer_num_}.mlp")
 
     def _init_norm(self):
-        self.att_norm_weight_ = NoTpNormWeight(
-            f"model.layers.{self.layer_num_}.input_layernorm.weight", self.data_type_
+        hidden_size = self.network_config_["hidden_size"]
+
+        self.att_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=f"model.layers.{self.layer_num_}.input_layernorm.weight",
+            data_type=self.data_type_,
         )
-        self.ffn_norm_weight_ = NoTpNormWeight(
-            f"model.layers.{self.layer_num_}.post_attention_layernorm.weight", self.data_type_
+        self.ffn_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=f"model.layers.{self.layer_num_}.post_attention_layernorm.weight",
+            data_type=self.data_type_,
         )
-        self.kv_a_layernorm_ = NoTpNormWeight(
-            f"model.layers.{self.layer_num_}.self_attn.kv_a_layernorm.weight", self.data_type_
+        self.kv_a_layernorm_ = RMSNormWeight(
+            dim=self.kv_lora_rank + self.qk_rope_head_dim,
+            weight_name=f"model.layers.{self.layer_num_}.self_attn.kv_a_layernorm.weight",
+            data_type=self.data_type_,
         )
         if self.q_lora_rank is not None:
-            self.q_a_layernorm_ = NoTpNormWeight(
-                f"model.layers.{self.layer_num_}.self_attn.q_a_layernorm.weight", self.data_type_
+            self.q_a_layernorm_ = RMSNormWeight(
+                dim=self.q_lora_rank,
+                weight_name=f"model.layers.{self.layer_num_}.self_attn.q_a_layernorm.weight",
+                data_type=self.data_type_,
             )
diff --git a/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
index 1f0815c3d..719c80c27 100644
--- a/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -2,7 +2,7 @@
 from lightllm.common.basemodel.layer_weights.meta_weights import (
     EmbeddingWeight,
     LMHeadWeight,
-    NoTpNormWeight,
+    RMSNormWeight,
     ROWMMWeight,
 )
 
@@ -11,6 +11,7 @@ class Deepseek3MTPPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
 
+        hidden_size = network_config["hidden_size"]
         self.eh_proj_weight_ = ROWMMWeight(
             weight_names="model.layers.0.eh_proj.weight",
             data_type=self.data_type_,
@@ -18,17 +19,20 @@ def __init__(self, data_type, network_config):
             tp_rank=0,
             tp_world_size=1,
         )
-        self.enorm_weight_ = NoTpNormWeight(
+        self.enorm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.layers.0.enorm.weight",
             data_type=self.data_type_,
             bias_name=None,
         )
-        self.hnorm_weight_ = NoTpNormWeight(
+        self.hnorm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.layers.0.hnorm.weight",
             data_type=self.data_type_,
             bias_name=None,
         )
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.layers.0.shared_head.norm.weight",
             data_type=self.data_type_,
             bias_name=None,
diff --git a/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py
index 3ed7004c1..7419d35e9 100644
--- a/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py
@@ -1,5 +1,5 @@
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, RMSNormWeight
 
 
 class Internlm2PreAndPostLayerWeight(PreAndPostLayerWeight):
@@ -7,8 +7,9 @@ def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
         self.wte_weight_ = EmbeddingWeight(weight_name="model.tok_embeddings.weight", data_type=self.data_type_)
         self.lm_head_weight_ = LMHeadWeight(weight_name="output.weight", data_type=self.data_type_)
-
-        self.final_norm_weight_ = NoTpNormWeight(
+        hidden_size = network_config["hidden_size"]
+        self.final_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.norm.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py
index 59caf40d6..caef47399 100644
--- a/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py
@@ -1,11 +1,12 @@
 import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, NoTpNormWeight, ROWMMWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, RMSNormWeight, ROWMMWeight
 
 
 class Internlm2RewardPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
+        hidden_size = network_config["hidden_size"]
         self.wte_weight_ = EmbeddingWeight(
             weight_name="model.tok_embeddings.weight",
             data_type=self.data_type_,
@@ -17,7 +18,8 @@ def __init__(self, data_type, network_config):
             tp_rank=0,
             tp_world_size=1,
         )
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.norm.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
index 7e9ff4167..82c1f3aa2 100644
--- a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
@@ -1,11 +1,12 @@
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, RMSNormWeight
 
 
 class LlamaPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
 
+        hidden_size = network_config["hidden_size"]
         self.wte_weight_ = EmbeddingWeight(
             weight_name="model.embed_tokens.weight",
             data_type=self.data_type_,
@@ -19,9 +20,9 @@ def __init__(self, data_type, network_config):
                 data_type=self.data_type_,
             )
 
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.norm.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
         return
diff --git a/lightllm/models/llama/layer_weights/transformer_layer_weight.py b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
index a455a01f9..426230e14 100644
--- a/lightllm/models/llama/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
@@ -2,7 +2,7 @@
 import math
 import numpy as np
 from lightllm.common.basemodel import TransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, COLMMWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, COLMMWeight, RMSNormWeight
 
 
 class LlamaTransformerLayerWeight(TransformerLayerWeight):
@@ -115,9 +115,16 @@ def _init_ffn(self):
         )
 
     def _init_norm(self):
-        self.att_norm_weight_ = NoTpNormWeight(
-            self._att_norm_weight_name, self.data_type_, bias_name=self._att_norm_bias_name
+        hidden_size = self.network_config_["hidden_size"]
+        self.att_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=self._att_norm_weight_name,
+            data_type=self.data_type_,
+            bias_name=self._att_norm_bias_name,
         )
-        self.ffn_norm_weight_ = NoTpNormWeight(
-            self._ffn_norm_weight_name, self.data_type_, bias_name=self._ffn_norm_bias_name
+        self.ffn_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=self._ffn_norm_weight_name,
+            data_type=self.data_type_,
+            bias_name=self._ffn_norm_bias_name,
         )
diff --git a/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
index c9032f6fe..a65250b16 100644
--- a/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -2,7 +2,7 @@
 from lightllm.common.basemodel.layer_weights.meta_weights import (
     EmbeddingWeight,
     LMHeadWeight,
-    NoTpNormWeight,
+    RMSNormWeight,
     ROWMMWeight,
 )
 
@@ -10,7 +10,7 @@
 class MistralMTPPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
-
+        hidden_size = network_config["hidden_size"]
         self.eh_proj_weight_ = ROWMMWeight(
             weight_names="mtp.eh_proj.weight",
             data_type=self.data_type_,
@@ -19,12 +19,13 @@ def __init__(self, data_type, network_config):
             tp_rank=0,
             tp_world_size=1,
         )
-        self.enorm_weight_ = NoTpNormWeight(
+        self.enorm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="mtp.enorm.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
-        self.hnorm_weight_ = NoTpNormWeight(
+        self.hnorm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="mtp.hnorm.weight",
             data_type=self.data_type_,
             bias_name=None,
@@ -32,5 +33,5 @@ def __init__(self, data_type, network_config):
 
         self.wte_weight_: EmbeddingWeight = None
         self.lm_head_weight_: LMHeadWeight = None
-        self.final_norm_weight_: NoTpNormWeight = None
+        self.final_norm_weight_: RMSNormWeight = None
         return
diff --git a/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py b/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
index 08f280b06..b58e58799 100644
--- a/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
@@ -1,5 +1,5 @@
 from lightllm.common.basemodel import TransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, COLMMWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, COLMMWeight, RMSNormWeight
 
 
 class MistralMTPTransformerLayerWeight(TransformerLayerWeight):
@@ -41,6 +41,10 @@ def _init_ffn(self):
         )
 
     def _init_norm(self):
-        self.ffn_norm_weight_ = NoTpNormWeight(
-            self._ffn_norm_weight_name, self.data_type_, bias_name=self._ffn_norm_bias_name
+        hidden_size = self.network_config_["hidden_size"]
+        self.ffn_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=self._ffn_norm_weight_name,
+            data_type=self.data_type_,
+            bias_name=self._ffn_norm_bias_name,
         )
diff --git a/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py
index bf9282a97..c35f5c78c 100644
--- a/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py
@@ -1,12 +1,13 @@
 import torch
 import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, RMSNormWeight
 
 
 class QwenPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
+        hidden_size = network_config["hidden_size"]
         self.wte_weight_ = EmbeddingWeight(
             weight_name="transformer.wte.weight",
             data_type=self.data_type_,
@@ -15,7 +16,8 @@ def __init__(self, data_type, network_config):
             weight_name="lm_head.weight",
             data_type=self.data_type_,
         )
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="transformer.ln_f.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
index 90b7810ad..cbf420f50 100644
--- a/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
@@ -1,6 +1,6 @@
 from lightllm.models.qwen2.layer_weights.transformer_layer_weight import Qwen2TransformerLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import (
-    NoTpNormWeight,
+    RMSNormWeight,
 )
 
 
@@ -19,6 +19,14 @@ def _init_weight_names(self):
 
     def _init_norm(self):
         super()._init_norm()
-
-        self.q_norm_weight_ = NoTpNormWeight(weight_name=self._q_norm_name, data_type=self.data_type_)
-        self.k_norm_weight_ = NoTpNormWeight(weight_name=self._k_norm_name, data_type=self.data_type_)
+        hidden_size = self.network_config_["hidden_size"]
+        self.q_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=self._q_norm_name,
+            data_type=self.data_type_,
+        )
+        self.k_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=self._k_norm_name,
+            data_type=self.data_type_,
+        )
diff --git a/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
index 8ba95c138..e3a557d55 100644
--- a/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -4,7 +4,7 @@
     EmbeddingWeight,
     ROWMMWeight,
     LMHeadWeight,
-    NoTpNormWeight,
+    RMSNormWeight,
 )
 
 
@@ -12,6 +12,7 @@ class Qwen3MOEMTPPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
 
+        hidden_size = network_config["hidden_size"]
         self.eh_proj_weight_ = ROWMMWeight(
             weight_names="model.layers.0.proj.weight",
             data_type=self.data_type_,
@@ -19,12 +20,14 @@ def __init__(self, data_type, network_config):
             tp_rank=0,
             tp_world_size=1,
         )
-        self.enorm_weight_ = NoTpNormWeight(
+        self.enorm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.layers.0.norm_after_embedding.weight",
             data_type=self.data_type_,
             bias_name=None,
         )
-        self.hnorm_weight_ = NoTpNormWeight(
+        self.hnorm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.layers.0.norm_before_output.weight",
             data_type=self.data_type_,
             bias_name=None,
@@ -32,5 +35,5 @@ def __init__(self, data_type, network_config):
         # 与Qwen3MOE模型共享
         self.wte_weight_: EmbeddingWeight = None
         self.lm_head_weight_: LMHeadWeight = None
-        self.final_norm_weight_: NoTpNormWeight = None
+        self.final_norm_weight_: RMSNormWeight = None
         return
diff --git a/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py
index 095afecd9..2a11724ce 100644
--- a/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py
@@ -1,6 +1,6 @@
 import os
 from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import RMSNormWeight
 
 
 class Qwen3MOEMTPTransformerLayerWeight(Qwen3MOETransformerLayerWeight):
@@ -16,6 +16,10 @@ def _init_weight(self):
             self._init_ffn()
 
     def _init_norm(self):
-        self.ffn_norm_weight_ = NoTpNormWeight(
-            self._ffn_norm_weight_name, self.data_type_, bias_name=self._ffn_norm_bias_name
+        hidden_size = self.network_config_["hidden_size"]
+        self.ffn_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
+            weight_name=self._ffn_norm_weight_name,
+            data_type=self.data_type_,
+            bias_name=self._ffn_norm_bias_name,
         )
diff --git a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
index 52a982f49..43758731b 100644
--- a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
@@ -1,11 +1,12 @@
 import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, RMSNormWeight
 
 
 class Qwen3VLMOEPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
+        hidden_size = network_config["hidden_size"]
         self.wte_weight_ = EmbeddingWeight(
             weight_name="model.language_model.embed_tokens.weight",
             data_type=self.data_type_,
@@ -18,7 +19,8 @@ def __init__(self, data_type, network_config):
                 weight_name="lm_head.weight",
                 data_type=self.data_type_,
             )
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = RMSNormWeight(
+            dim=hidden_size,
             weight_name="model.language_model.norm.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/stablelm/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/stablelm/layer_weights/pre_and_post_layer_weight.py
index 3d044eeb5..885c7ead7 100755
--- a/lightllm/models/stablelm/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/stablelm/layer_weights/pre_and_post_layer_weight.py
@@ -1,10 +1,13 @@
-from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight, NoTpNormWeight
+from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import LayerNormWeight
 
 
 class StableLMPreAndPostLayerWeight(LlamaPreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
-        self.final_norm_weight_ = NoTpNormWeight(
+        hidden_size = network_config["hidden_size"]
+        self.final_norm_weight_ = LayerNormWeight(
+            dim=hidden_size,
             weight_name="model.norm.weight",
             data_type=self.data_type_,
             bias_name="model.norm.bias",
diff --git a/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
index 34e74f136..939c6a146 100644
--- a/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
@@ -1,7 +1,7 @@
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import (
     EmbeddingWeight,
-    NoTpNormWeight,
+    LayerNormWeight,
     NoTpPosEmbeddingWeight,
     LMHeadWeight,
 )
@@ -12,6 +12,7 @@ def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
 
     def _create_weight(self):
+        hidden_size = self.network_config_["hidden_size"]
         self.wte_weight_ = EmbeddingWeight(
             weight_name="transformer.wte.weight",
             data_type=self.data_type_,
@@ -21,7 +22,8 @@ def _create_weight(self):
             data_type=self.data_type_,
         )
 
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = LayerNormWeight(
+            dim=hidden_size,
             weight_name="transformer.ln_f.weight",
             bias_name="transformer.ln_f.bias",
             data_type=self.data_type_,
diff --git a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
index d08c27cc7..7890f82dc 100644
--- a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
@@ -1,13 +1,13 @@
 import torch
 import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, NoTpNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, LayerNormWeight
 
 
 class Starcoder2PreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
-
+        hidden_size = network_config["hidden_size"]
         self.wte_weight_ = EmbeddingWeight(
             weight_name="model.embed_tokens.weight",
             data_type=self.data_type_,
@@ -21,7 +21,8 @@ def __init__(self, data_type, network_config):
                 data_type=self.data_type_,
             )
 
-        self.final_norm_weight_ = NoTpNormWeight(
+        self.final_norm_weight_ = LayerNormWeight(
+            dim=hidden_size,
             weight_name="model.norm.weight",
             data_type=self.data_type_,
             bias_name="model.norm.bias",
diff --git a/lightllm/models/vit/layer_weights/transformer_layer_weight.py b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
index dffcc16fe..5a7a24a9a 100644
--- a/lightllm/models/vit/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
@@ -1,4 +1,5 @@
 import os
+from turtle import TPen
 import torch
 import math
 import numpy as np
@@ -7,8 +8,9 @@
 from lightllm.common.basemodel.layer_weights.meta_weights import (
     ROWMMWeight,
     COLMMWeight,
-    NoTpNormWeight,
-    TpVitPadNormWeight,
+    RMSNormWeight,
+    LayerNormWeight,
+    TpRMSNormWeight,
 )
 from lightllm.utils.dist_utils import get_current_device_id
 
@@ -119,16 +121,34 @@ def _init_ffn(self):
         )
 
     def _init_norm(self):
-        self.att_norm_weight_ = NoTpNormWeight(
-            self._att_norm_weight_name, self.data_type_, bias_name=self._att_norm_bias_name
+        norm_weight_cls = RMSNormWeight if self.norm_type == "rms_norm" else LayerNormWeight
+        hidden_size = self.network_config_["hidden_size"]
+        self.att_norm_weight_ = norm_weight_cls(
+            dim=hidden_size,
+            weight_name=self._att_norm_weight_name,
+            data_type=self.data_type_,
+            bias_name=self._att_norm_bias_name,
         )
-        self.ffn_norm_weight_ = NoTpNormWeight(
-            self._ffn_norm_weight_name, self.data_type_, bias_name=self._ffn_norm_bias_name
+        self.ffn_norm_weight_ = norm_weight_cls(
+            dim=hidden_size,
+            weight_name=self._ffn_norm_weight_name,
+            data_type=self.data_type_,
+            bias_name=self._ffn_norm_bias_name,
         )
         if self.qk_norm:
             head_num = self.network_config_["num_attention_heads"]
-            self.q_norm_weight_ = TpVitPadNormWeight(self._q_norm_weight_name, self.data_type_, head_num=head_num)
-            self.k_norm_weight_ = TpVitPadNormWeight(self._k_norm_weight_name, self.data_type_, head_num=head_num)
+            self.q_norm_weight_ = TpRMSNormWeight(
+                dim=hidden_size,
+                weight_name=self._q_norm_weight_name,
+                data_type=self.data_type_,
+                head_num=head_num,
+            )
+            self.k_norm_weight_ = TpRMSNormWeight(
+                dim=hidden_size,
+                weight_name=self._k_norm_weight_name,
+                data_type=self.data_type_,
+                head_num=head_num,
+            )
 
     def load_hf_weights(self, weights):
         if f"vision_model.encoder.layers.{self.layer_num_}.attn.qkv.weight" in weights:

From efcaa4eeaef8afa6e388f7990db9c0f2062ad91e Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 10:05:58 +0000
Subject: [PATCH 04/65] mm weight refactor

---
 .../layer_weights/meta_weights/__init__.py    |   2 +-
 .../meta_weights/mm_weight/__init__.py        |   2 +-
 .../meta_weights/mm_weight/colmm_weight.py    |  10 +-
 .../meta_weights/mm_weight/mm_slicer.py       | 129 ++++++++++--------
 .../meta_weights/mm_weight/mm_weight.py       |  46 ++-----
 .../meta_weights/mm_weight/rowmm_weight.py    |  53 +++++--
 .../layer_weights/transformer_layer_weight.py |  24 ++--
 .../layer_weights/transformer_layer_weight.py |  30 ----
 8 files changed, 149 insertions(+), 147 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index 109777401..cbf399843 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -2,8 +2,8 @@
 from .mm_weight import (
     MMWeightTpl,
     ROWMMWeight,
+    KVROWNMMWeight,
     COLMMWeight,
-    ROWBMMWeight,
 )
 from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight
 from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
index 34d989b01..ae0c65197 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
@@ -1,5 +1,5 @@
 from .mm_weight import (
     MMWeightTpl,
 )
-from .rowmm_weight import ROWMMWeight, ROWBMMWeight
+from .rowmm_weight import ROWMMWeight, KVROWNMMWeight
 from .colmm_weight import COLMMWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py
index bf73b9ad8..1a02e00d0 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py
@@ -6,6 +6,7 @@
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.common.quantization.quantize_method import QuantizationMethod
 from typing import Dict, List, Optional, Union
+from lightllm.utils.dist_utils import get_current_rank_in_dp, get_dp_world_size
 from .mm_slicer import get_col_slice_mixin
 
 
@@ -21,6 +22,9 @@ def __init__(
         tp_rank: int = None,
         tp_world_size: int = None,
     ) -> None:
+        self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
+        self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
+        in_dim = self._get_tp_dim(in_dim)
         super().__init__(
             in_dim=in_dim,
             out_dims=out_dims,
@@ -28,9 +32,9 @@ def __init__(
             data_type=data_type,
             bias_names=bias_names,
             quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
+            tp_rank=self.tp_rank_,
+            tp_world_size=self.tp_world_size_,
         )
         self.param_slicer = get_col_slice_mixin(
-            self.quant_method.method_name, tp_rank=tp_rank, tp_world_size=tp_world_size
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
         )
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
index e2830ab61..4bc3b44a8 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
@@ -1,5 +1,5 @@
 import torch
-from typing import Optional
+from typing import Optional, Tuple
 from abc import ABC, abstractmethod
 from lightllm.utils.dist_utils import get_current_rank_in_dp, get_dp_world_size
 
@@ -7,9 +7,12 @@
 class SliceMixinBase(ABC):
     """切片操作的Mixin基类"""
 
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
         self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
         self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
+        # this param is used to slice the weight when tp_world_size_ is divisible by the kv_head_num
+        # for example, if tp_world_size_ is 8 and kv_head_num is 4, then repeat_times_ is 2
+        self.repeat_times_ = repeat_times
 
     @abstractmethod
     def _slice_weight(self, weight: torch.Tensor):
@@ -19,10 +22,16 @@ def _slice_weight(self, weight: torch.Tensor):
     def _slice_bias(self, bias):
         pass
 
+    def _get_slice_start_end(self, size: int) -> Tuple[int, int]:
+        """Return the [start, end) range of this rank's shard along an axis of length ``size``."""
+        tp_size = size * self.repeat_times_ // self.tp_world_size_
+        start = tp_size * (self.tp_rank_ // self.repeat_times_)  # repeat_times_ consecutive ranks share a shard
+        return start, start + tp_size
+
 
 class SliceMixinTpl(SliceMixinBase):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
+        super().__init__(tp_rank, tp_world_size, repeat_times)
 
     def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
         raise NotImplementedError("slice_weight must implement this method")
@@ -40,113 +49,117 @@ def _slice_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Ten
 # 默认weight 的shape是 outxin，这也是目前最通用的约定。
 # 所以row-wise是沿着dim=0进行切分，col-wise是沿着dim=1进行切分。
 class RowSliceMixin(SliceMixinTpl):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
+        super().__init__(tp_rank, tp_world_size, repeat_times)
 
     def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
-        assert weight.shape[0] % self.tp_world_size_ == 0, f"tp slice error {weight.shape[0]} % {self.tp_world_size_}"
-        tp_size = weight.shape[0] // self.tp_world_size_
-        return weight[tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)]
+        assert (
+            weight.shape[0] * self.repeat_times_ % self.tp_world_size_ == 0
+        ), f"tp slice error {weight.shape[0] * self.repeat_times_} % {self.tp_world_size_}"
+        start, end = self._get_slice_start_end(weight.shape[0])
+        return weight[start:end, :]
 
     def _slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        assert bias.shape[0] % self.tp_world_size_ == 0, f"tp slice error {bias.shape[0]} % {self.tp_world_size_}"
-        tp_size = bias.shape[0] // self.tp_world_size_
-        return bias[tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)]
+        assert (
+            bias.shape[0] * self.repeat_times_ % self.tp_world_size_ == 0
+        ), f"tp slice error {bias.shape[0] * self.repeat_times_} % {self.tp_world_size_}"
+        start, end = self._get_slice_start_end(bias.shape[0])
+        return bias[start:end]
 
 
 # 量化切片默认实现方式是group-wise的量化，所以weight_scale 和weight_zero_point ndims跟weight一样。
 # 后续按需要，扩展per-tensor、per-channel的量化方式。
 class QuantizedRowSliceMixin(RowSliceMixin):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
+        super().__init__(tp_rank, tp_world_size, repeat_times)
 
     def _slice_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
         assert (
             weight_scale.shape[0] % self.tp_world_size_ == 0
         ), f"tp slice error {weight_scale.shape[0]} % {self.tp_world_size_}"
-        tp_size = weight_scale.shape[0] // self.tp_world_size_
-        scale_start = tp_size * self.tp_rank_
-        scale_end = tp_size * (self.tp_rank_ + 1)
-        return weight_scale[scale_start:scale_end]
+        start, end = self._get_slice_start_end(weight_scale.shape[0])
+        return weight_scale[start:end]
 
     def _slice_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
         assert (
             weight_zero_point.shape[0] % self.tp_world_size_ == 0
         ), f"tp slice error {weight_zero_point.shape[0]} % {self.tp_world_size_}"
-        tp_size = weight_zero_point.shape[0] // self.tp_world_size_
-        zero_point_start = tp_size * self.tp_rank_
-        zero_point_end = tp_size * (self.tp_rank_ + 1)
-        return weight_zero_point[zero_point_start:zero_point_end]
+        start, end = self._get_slice_start_end(weight_zero_point.shape[0])
+        return weight_zero_point[start:end]
 
 
 class ColSliceMixin(SliceMixinTpl):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
+        super().__init__(tp_rank, tp_world_size, repeat_times)
 
     def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
-        assert weight.shape[1] % self.tp_world_size_ == 0, f"tp slice error {weight.shape[1]} % {self.tp_world_size_}"
-        tp_size = weight.shape[1] // self.tp_world_size_
-        return weight[:, tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)]
+        assert (
+            weight.shape[1] * self.repeat_times_ % self.tp_world_size_ == 0
+        ), f"tp slice error {weight.shape[1] * self.repeat_times_ } % {self.tp_world_size_}"
+        start, end = self._get_slice_start_end(weight.shape[1])
+        return weight[:, start:end]
 
     def _slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        return bias / self.tp_world_size_
+        return bias / self.tp_world_size_ * self.repeat_times_
 
 
 class QuantizedColSliceMixin(ColSliceMixin):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
+        super().__init__(tp_rank, tp_world_size, repeat_times)
 
     def _slice_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
         assert (
-            weight_scale.shape[1] % self.tp_world_size_ == 0
-        ), f"tp slice error {weight_scale.shape[1]} % {self.tp_world_size_}"
-        tp_size = weight_scale.shape[1] // self.tp_world_size_
-        scale_start = tp_size * self.tp_rank_
-        scale_end = tp_size * (self.tp_rank_ + 1)
-        return weight_scale[:, scale_start:scale_end]
+            weight_scale.shape[1] * self.repeat_times_ % self.tp_world_size_ == 0
+        ), f"tp slice error {weight_scale.shape[1] * self.repeat_times_ } % {self.tp_world_size_}"
+        start, end = self._get_slice_start_end(weight_scale.shape[1])
+        return weight_scale[:, start:end]
 
     def _slice_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
         assert (
-            weight_zero_point.shape[1] % self.tp_world_size_ == 0
-        ), f"tp slice error {weight_zero_point.shape[1]} % {self.tp_world_size_}"
-        tp_size = weight_zero_point.shape[1] // self.tp_world_size_
-        zero_point_start = tp_size * self.tp_rank_
-        zero_point_end = tp_size * (self.tp_rank_ + 1)
-        return weight_zero_point[:, zero_point_start:zero_point_end]
+            weight_zero_point.shape[1] * self.repeat_times_ % self.tp_world_size_ == 0
+        ), f"tp slice error {weight_zero_point.shape[1] * self.repeat_times_ } % {self.tp_world_size_}"
+        start, end = self._get_slice_start_end(weight_zero_point.shape[1])
+        return weight_zero_point[:, start:end]
 
 
 # awq 的量化权重是inxout存储格式，需要定制实现。
 class AwqQuantizedRowSliceMixin(QuantizedColSliceMixin):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
+        super().__init__(tp_rank, tp_world_size, repeat_times)
 
     def _slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        assert bias.shape[0] % self.tp_world_size_ == 0, f"tp slice error {bias.shape[0]} % {self.tp_world_size_}"
-        tp_size = bias.shape[0] // self.tp_world_size_
-        return bias[tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)]
+        assert (
+            bias.shape[0] * self.repeat_times_ % self.tp_world_size_ == 0
+        ), f"tp slice error {bias.shape[0] * self.repeat_times_ } % {self.tp_world_size_}"
+        start, end = self._get_slice_start_end(bias.shape[0])
+        return bias[start:end]
 
 
 class AwqQuantizedColSliceMixin(QuantizedRowSliceMixin):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1):
+        super().__init__(tp_rank, tp_world_size, repeat_times)
 
     def _slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        return bias / self.tp_world_size_
+        return bias / self.tp_world_size_ * self.repeat_times_
 
 
-def get_row_slice_mixin(quant_method_name: str, tp_rank: int = None, tp_world_size: int = None) -> SliceMixinTpl:
+def get_row_slice_mixin(
+    quant_method_name: str, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1
+) -> SliceMixinTpl:
     if quant_method_name.startswith("awq"):
-        return AwqQuantizedRowSliceMixin(tp_rank, tp_world_size)
+        return AwqQuantizedRowSliceMixin(tp_rank, tp_world_size, repeat_times)
     elif quant_method_name == "none":
-        return RowSliceMixin(tp_rank, tp_world_size)
+        return RowSliceMixin(tp_rank, tp_world_size, repeat_times)
     else:
-        return QuantizedRowSliceMixin(tp_rank, tp_world_size)
+        return QuantizedRowSliceMixin(tp_rank, tp_world_size, repeat_times)
 
 
-def get_col_slice_mixin(quant_method_name: str, tp_rank: int = None, tp_world_size: int = None) -> SliceMixinTpl:
+def get_col_slice_mixin(
+    quant_method_name: str, tp_rank: int = None, tp_world_size: int = None, repeat_times: int = 1
+) -> SliceMixinTpl:
     if quant_method_name.startswith("awq"):
-        return AwqQuantizedColSliceMixin(tp_rank, tp_world_size)
+        return AwqQuantizedColSliceMixin(tp_rank, tp_world_size, repeat_times)
     elif quant_method_name == "none":
-        return ColSliceMixin(tp_rank, tp_world_size)
+        return ColSliceMixin(tp_rank, tp_world_size, repeat_times)
     else:
-        return QuantizedColSliceMixin(tp_rank, tp_world_size)
+        return QuantizedColSliceMixin(tp_rank, tp_world_size, repeat_times)
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 92236b798..2bb7193c5 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -57,15 +57,6 @@ def __init__(
         self._create_weight()
         self.gen_weight_quant_param_names(quant_method=quant_method)
 
-    def _create_weight(self):
-        self.bias = None
-        if self.bias_names is not None:
-            self.bias = torch.empty(self.cusum_out_dims[-1], dtype=self.data_type_).cuda(get_current_device_id())
-        self.mm_param: WeightPack = self.quant_method.create_weight(
-            in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()
-        )
-        return
-
     def mm(
         self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
     ) -> torch.Tensor:
@@ -133,6 +124,15 @@ def load_hf_weights(self, weights):
     def verify_load(self) -> bool:
         return True
 
+    def _create_weight(self):
+        self.bias = None
+        if self.bias_names is not None:
+            self.bias = torch.empty(self.cusum_out_dims[-1], dtype=self.data_type_).cuda(get_current_device_id())
+        self.mm_param: WeightPack = self.quant_method.create_weight(
+            in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()
+        )
+        return
+
     # 执行顺序
     def _load_weight(
         self, param_name: Union[str, List[str]], weights: Dict[str, torch.Tensor], sub_child_index: int
@@ -174,26 +174,8 @@ def _load_weight_zero_point(
             self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param, start_idx)
         return
 
-
-class BMMWeightTpl(MMWeightTpl):
-    def mm(
-        self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
-    ) -> torch.Tensor:
-        raise RuntimeError("use bmm not mm")
-
-    def bmm(
-        self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
-    ) -> torch.Tensor:
-        # 目前 bmm 不支持量化运算操作
-        fpweight = self.mm_param.weight
-        if out is None:
-            shape = (input_tensor.shape[0], input_tensor.shape[1], fpweight.shape[2])
-            dtype = input_tensor.dtype
-            device = input_tensor.device
-            if use_custom_tensor_mananger:
-                out = g_cache_manager.alloc_tensor(shape, dtype, device=device)
-            else:
-                out = torch.empty(shape, dtype=dtype, device=device)
-        if self.bias is None:
-            return torch.bmm(input_tensor, fpweight, out=out)
-        return torch.addbmm(self.bias, input_tensor, fpweight, out=out)
+    def _get_tp_dim(self, dim: int) -> int:
+        assert (
+            dim % self.tp_world_size_ == 0
+        ), f"dim must be divisible by tp_world_size_, but found: {dim} % {self.tp_world_size_}"
+        return dim // self.tp_world_size_
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
index e53d643ce..e73b0cecb 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
@@ -1,12 +1,12 @@
 import torch
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import (
     MMWeightTpl,
-    BMMWeightTpl,
 )
 from lightllm.common.quantization import Quantcfg
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.common.quantization.quantize_method import QuantizationMethod
 from typing import Dict, List, Optional, Union
+from lightllm.utils.dist_utils import get_current_rank_in_dp, get_dp_world_size
 from .mm_slicer import get_row_slice_mixin
 
 
@@ -22,6 +22,9 @@ def __init__(
         tp_rank: int = None,
         tp_world_size: int = None,
     ) -> None:
+        self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
+        self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
+        out_dims = [self._get_tp_dim(out_dim) for out_dim in out_dims]
         super().__init__(
             in_dim=in_dim,
             out_dims=out_dims,
@@ -29,17 +32,20 @@ def __init__(
             bias_names=bias_names,
             data_type=data_type,
             quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
+            tp_rank=self.tp_rank_,
+            tp_world_size=self.tp_world_size_,
         )
         self.param_slicer = get_row_slice_mixin(
-            self.quant_method.method_name, tp_rank=tp_rank, tp_world_size=tp_world_size
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
         )
 
 
-class ROWBMMWeight(BMMWeightTpl):
+class KVROWNMMWeight(MMWeightTpl):
     def __init__(
         self,
+        in_dim: int,
+        kv_head_num: int,
+        head_dim: int,
         weight_names: Union[str, List[str]],
         data_type: torch.dtype,
         bias_names: Optional[Union[str, List[str]]] = None,
@@ -47,13 +53,45 @@ def __init__(
         tp_rank: int = None,
         tp_world_size: int = None,
     ) -> None:
+        # Use the trailing-underscore names: the assert and _get_tp_padded_head_num
+        # below read self.tp_world_size_ before super().__init__() has run.
+        self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
+        self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
+        self.repeat_times = 1
+        assert kv_head_num % self.tp_world_size_ == 0 or self.tp_world_size_ % kv_head_num == 0, (
+            f"kv_head_num must be divisible by tp_world_size_ or "
+            f"tp_world_size_ must be divisible by kv_head_num, "
+            f"but found: {kv_head_num} % {self.tp_world_size_}"
+        )
+        kv_hidden_size = self._get_tp_padded_head_num(kv_head_num) * head_dim
+        out_dims = [kv_hidden_size, kv_hidden_size]
         super().__init__(
+            in_dim=in_dim,
+            out_dims=out_dims,
             weight_names=weight_names,
             data_type=data_type,
             bias_names=bias_names,
             quant_method=quant_method,
-            tp_rank=tp_rank,
-            tp_world_size=tp_world_size,
+            tp_rank=self.tp_rank_,
+            tp_world_size=self.tp_world_size_,
         )
-        # bmm 不支持量化运算操作
-        self.param_slicer = get_row_slice_mixin(quant_method_name="none", tp_rank=tp_rank, tp_world_size=tp_world_size)
+        self.param_slicer = get_row_slice_mixin(
+            self.quant_method.method_name,
+            tp_rank=self.tp_rank_,
+            tp_world_size=self.tp_world_size_,
+            repeat_times=self.repeat_times,
+        )
+
+    def _get_tp_padded_head_num(self, head_num: int):
+        """Return head_num's per-rank share; sets self.repeat_times when tp_world_size_ is a multiple of head_num."""
+        if head_num % self.tp_world_size_ == 0:
+            return head_num // self.tp_world_size_
+        elif self.tp_world_size_ % head_num == 0:
+            self.repeat_times = self.tp_world_size_ // head_num
+            return self.repeat_times * head_num // self.tp_world_size_
+        else:
+            raise ValueError(
+                f"head_num must be divisible by tp_world_size_ or "
+                f"tp_world_size_ must be divisible by head_num, "
+                f"but found: {head_num} % {self.tp_world_size_}"
+            )
diff --git a/lightllm/models/llama/layer_weights/transformer_layer_weight.py b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
index 426230e14..23ecbbabd 100644
--- a/lightllm/models/llama/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
@@ -2,7 +2,7 @@
 import math
 import numpy as np
 from lightllm.common.basemodel import TransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, COLMMWeight, RMSNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, COLMMWeight, RMSNormWeight, KVROWNMMWeight
 
 
 class LlamaTransformerLayerWeight(TransformerLayerWeight):
@@ -23,16 +23,15 @@ def _init_weight(self):
         self._init_norm()
 
     def _parse_config(self):
-        self.tp_q_head_num_ = self.network_config_["num_attention_heads"] // self.tp_world_size_
-        self.tp_k_head_num_ = max(self.network_config_["num_key_value_heads"] // self.tp_world_size_, 1)
-        self.tp_v_head_num_ = self.tp_k_head_num_
-        self.tp_o_head_num_ = self.tp_q_head_num_
+        self.n_head = self.network_config_["num_attention_heads"]
+        self.q_head_num_ = self.network_config_["num_attention_heads"]
+        self.k_head_num_ = self.network_config_["num_key_value_heads"]
+        self.v_head_num_ = self.k_head_num_
+        self.o_head_num_ = self.q_head_num_
         head_dim = self.network_config_["hidden_size"] // self.network_config_["num_attention_heads"]
         self.head_dim = self.network_config_.get("head_dim", head_dim)
-        assert (self.tp_k_head_num_ * self.tp_world_size_) % self.network_config_["num_key_value_heads"] == 0
         self.n_embed = self.network_config_["hidden_size"]
         self.n_inter = self.network_config_["intermediate_size"]
-        self.n_head = self.network_config_["num_attention_heads"]
 
     def _init_weight_names(self):
         self._q_weight_name = f"model.layers.{self.layer_num_}.self_attn.q_proj.weight"
@@ -62,9 +61,7 @@ def _init_weight_names(self):
 
     def _init_qkv(self):
         in_dim = self.n_embed
-        q_out_dim = self.tp_q_head_num_ * self.head_dim
-        k_out_dim = self.tp_k_head_num_ * self.head_dim
-        v_out_dim = self.tp_v_head_num_ * self.head_dim
+        q_out_dim = self.q_head_num_ * self.head_dim
         self.q_proj = ROWMMWeight(
             in_dim=in_dim,
             out_dims=[q_out_dim],
@@ -73,9 +70,10 @@ def _init_qkv(self):
             bias_names=self._q_bias_name,
             quant_method=self.get_quant_method("q_proj"),
         )
-        self.kv_proj = ROWMMWeight(
+        self.kv_proj = KVROWNMMWeight(
             in_dim=in_dim,
-            out_dims=[k_out_dim, v_out_dim],
+            kv_head_num=self.k_head_num_,
+            head_dim=self.head_dim,
             weight_names=[self._k_weight_name, self._v_weight_name],
             data_type=self.data_type_,
             bias_names=[self._k_bias_name, self._v_bias_name],
@@ -83,7 +81,7 @@ def _init_qkv(self):
         )
 
     def _init_o(self):
-        in_dim = self.tp_o_head_num_ * self.head_dim
+        in_dim = self.o_head_num_ * self.head_dim
         out_dim = self.n_embed
         self.o_proj = COLMMWeight(
             in_dim=in_dim,
diff --git a/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py
index 74cf6c600..fe6a5a2d4 100644
--- a/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen2/layer_weights/transformer_layer_weight.py
@@ -10,33 +10,3 @@ def _init_weight_names(self):
         self._q_bias_name = f"model.layers.{self.layer_num_}.self_attn.q_proj.bias"
         self._k_bias_name = f"model.layers.{self.layer_num_}.self_attn.k_proj.bias"
         self._v_bias_name = f"model.layers.{self.layer_num_}.self_attn.v_proj.bias"
-
-    def _repeat_weight(self, name, weights):
-        # for tp_world_size_ > num_key_value_heads
-        if name not in weights:
-            return
-
-        tensor = weights[name]
-        num_kv_heads = self.network_config_["num_key_value_heads"]
-        repeat_size = (self.tp_k_head_num_ * self.tp_world_size_) // num_kv_heads
-
-        if tensor.ndim == 1:
-            # Bias (1D tensor)
-            tensor = tensor.reshape(num_kv_heads, -1).unsqueeze(1).repeat(1, repeat_size, 1).reshape(-1)
-        else:
-            # Weight (2D tensor)
-            tensor = (
-                tensor.reshape(num_kv_heads, -1, tensor.shape[-1])
-                .unsqueeze(1)
-                .repeat(1, repeat_size, 1, 1)
-                .reshape(-1, tensor.shape[-1])
-            )
-        weights[name] = tensor
-
-    def load_hf_weights(self, weights):
-        self._repeat_weight(self._k_weight_name, weights)
-        self._repeat_weight(self._v_weight_name, weights)
-        if self._k_bias_name is not None and self._v_bias_name is not None:
-            self._repeat_weight(self._k_bias_name, weights)
-            self._repeat_weight(self._v_bias_name, weights)
-        return super().load_hf_weights(weights)

From 1e40cb74ba48fe281fa7e73313f229f5534d390f Mon Sep 17 00:00:00 2001
From: sufubao <sufubao@sensetime.com>
Date: Mon, 12 Jan 2026 08:24:57 +0000
Subject: [PATCH 05/65] Embedding and LMHead

---
 .../meta_weights/embedding_weight.py          | 144 ++++++++++++++----
 .../layer_weights/meta_weights/norm_weight.py |   4 +-
 .../bloom/layer_infer/post_layer_infer.py     |   4 +-
 .../bloom/layer_infer/pre_layer_infer.py      |   6 +-
 .../layer_infer/transformer_layer_infer.py    |   4 +-
 .../pre_and_post_layer_weight.py              |   8 +-
 .../layer_infer/transformer_layer_infer.py    |  30 +---
 .../layer_infer/pre_layer_infer.py            |  24 +--
 .../layer_infer/transformer_layer_infer.py    |  26 ++--
 .../pre_and_post_layer_weight.py              |   4 +
 .../gemma_2b/layer_infer/pre_layer_infer.py   |  10 +-
 .../pre_and_post_layer_weight.py              |   4 +
 .../pre_and_post_layer_weight.py              |  15 +-
 .../pre_and_post_layer_weight.py              |   4 +-
 .../llama/layer_infer/post_layer_infer.py     |   4 +-
 .../llama/layer_infer/pre_layer_infer.py      |   4 +-
 .../layer_infer/transformer_layer_infer.py    |   8 +-
 .../pre_and_post_layer_weight.py              |  16 +-
 .../layer_infer/pre_layer_infer.py            |  36 +----
 .../pre_and_post_layer_weight.py              |   7 +-
 .../layer_infer/transformer_layer_infer.py    |  16 +-
 .../pre_and_post_layer_weight.py              |  17 ++-
 .../layer_infer/transformer_layer_infer.py    |  12 +-
 .../starcoder/layer_infer/pre_layer_infer.py  |  14 +-
 .../pre_and_post_layer_weight.py              |   8 +
 .../layer_infer/transformer_layer_infer.py    |  12 +-
 .../pre_and_post_layer_weight.py              |  18 +--
 27 files changed, 240 insertions(+), 219 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index e9b9176dd..d1de857cd 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -2,7 +2,7 @@
 import numpy as np
 from typing import Dict, Optional
 from .base_weight import BaseWeightTpl
-from lightllm.utils.dist_utils import get_current_device_id
+from .platform_op import PlatformAwareOp
 from lightllm.common.basemodel.triton_kernel.embedding import embedding as embedding_kernel
 from lightllm.utils.dist_utils import get_dp_world_size, get_current_rank_in_dp
 from lightllm.utils.log_utils import init_logger
@@ -10,7 +10,7 @@
 logger = init_logger(__name__)
 
 
-class EmbeddingWeight(BaseWeightTpl):
+class EmbeddingWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch.dtype):
         super().__init__()
         self.dim = dim
@@ -23,14 +23,14 @@ def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch
         self.tp_vocab_end_id = int(split_indexes[self.tp_rank_ + 1])
         self.weight_name: str = weight_name
         self.data_type_ = data_type
-        self.weight: torch.Tensor = None
+        self._create_weight()
 
     def _create_weight(self):
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
         self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
-        if self.weight_name not in weights or self.weight is not None:
+        if self.weight_name not in weights:
             return
         t_weight = weights[self.weight_name]
         # init some params
@@ -39,16 +39,29 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
             loaded_vocab_size == self.vocab_size
         ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
         logger.info(f"loaded weight vocab_size: {self.vocab_size}")
-        self.weight.copy_(
-            t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_).cuda(get_current_device_id())
-        )
-
-    def embedding(self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty):
+        self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
+
+    def _native_forward(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
+    ) -> torch.Tensor:
+        # Ids owned by other tp ranks must yield zero rows: callers all-reduce (SUM) the per-rank outputs.
+        local_ids = input_ids - self.tp_vocab_start_id
+        owned = (local_ids >= 0) & (local_ids < self.weight.shape[0])
+        # Clamp only to keep the lookup in bounds; the rows it fetches for foreign ids are zeroed below.
+        result = torch.nn.functional.embedding(local_ids.clamp(0, self.weight.shape[0] - 1), self.weight)
+        result = result.masked_fill(~owned.unsqueeze(-1), 0)
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+
+    def _cuda_forward(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
         if out is None:
             out = alloc_func(
                 (input_ids.shape[0], self.weight.shape[1]), dtype=self.weight.dtype, device=self.weight.device
             )
-
         embedding_kernel(
             input_ids=input_ids,
             weight=self.weight,
@@ -56,10 +69,57 @@ def embedding(self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None,
             vob_end_id=self.tp_vocab_end_id,
             out=out,
         )
-
         return out
 
-    def lm_head(self, input: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty):
+    def __call__(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        return self._forward(input_ids=input_ids, out=out, alloc_func=alloc_func)
+
+
+class LMHeadWeight(BaseWeightTpl, PlatformAwareOp):
+    def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch.dtype):
+        super().__init__()
+        self.dim = dim
+        self.vocab_size = vocab_size
+        self.tp_world_size_ = get_dp_world_size()
+        self.tp_rank_ = get_current_rank_in_dp()
+        # 计算 split_indexes
+        split_indexes = np.linspace(0, self.vocab_size, self.tp_world_size_ + 1, dtype=np.int64)
+        self.tp_vocab_start_id = int(split_indexes[self.tp_rank_])
+        self.tp_vocab_end_id = int(split_indexes[self.tp_rank_ + 1])
+        self.weight_name: str = weight_name
+        self.data_type_ = data_type
+        self._create_weight()
+
+    def _create_weight(self):
+        tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
+        self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
+
+    def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
+        if self.weight_name not in weights:
+            return
+        t_weight = weights[self.weight_name]
+        loaded_vocab_size = len(t_weight)
+        assert (
+            loaded_vocab_size == self.vocab_size
+        ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
+        logger.info(f"loaded weight vocab_size: {self.vocab_size}")
+        self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
+
+    def _native_forward(
+        self, input: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
+    ) -> torch.Tensor:
+        assert input.ndim == 2
+        result = torch.mm(self.weight, input)
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+
+    def _cuda_forward(
+        self, input: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
         assert input.ndim == 2
         if out is None:
             out = alloc_func(
@@ -67,49 +127,67 @@ def lm_head(self, input: torch.Tensor, out: Optional[torch.Tensor] = None, alloc
                 dtype=input.dtype,
                 device=input.device,
             )
-
         torch.mm(self.weight, input, out=out)
         return out
 
-
-class LMHeadWeight(EmbeddingWeight):
-    def __init__(self, weight_name, data_type):
-        super().__init__(weight_name, data_type)
+    def __call__(self, input: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty) -> torch.Tensor:
+        return self._forward(input=input, out=out, alloc_func=alloc_func)
 
 
-class NoTpPosEmbeddingWeight(BaseWeightTpl):
-    def __init__(self, weight_name, data_type):
+class NoTpPosEmbeddingWeight(BaseWeightTpl, PlatformAwareOp):
+    def __init__(self, dim: int, max_position_embeddings: int, weight_name: str, data_type: torch.dtype):
         super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
         self.weight_name: str = weight_name
         self.data_type_ = data_type
-        self.weight: torch.Tensor = None
         self.tp_world_size_ = 1
         self.tp_rank_ = 0
+        self._create_weight()
+
+    def _create_weight(self):
+        self.weight: torch.Tensor = torch.empty(
+            self.max_position_embeddings, self.dim, dtype=self.data_type_, device=self.device_id_
+        )
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
-        if self.weight_name not in weights or self.weight is not None:
+        if self.weight_name not in weights:
             return
-
         t_weight = weights[self.weight_name]
-        self.weight = t_weight.to(self.data_type_).cuda(get_current_device_id())
-        self.end_position_id: int = t_weight.shape[0]
-        logger.info(f"loaded weight end_position_id: {self.end_position_id}")
-
-    def verify_load(self):
-        return self.weight is not None
-
-    def embedding(self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty):
+        loaded_max_position_embeddings = t_weight.shape[0]
+        assert (
+            loaded_max_position_embeddings == self.max_position_embeddings
+        ), f"max_position_embeddings: {loaded_max_position_embeddings} != expected: {self.max_position_embeddings}"
+        logger.info(f"loaded weight max_position_embeddings: {self.max_position_embeddings}")
+        self.weight.copy_(t_weight.to(self.data_type_))
+
+    def _native_forward(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
+    ) -> torch.Tensor:
+        # Use PyTorch native embedding
+        result = torch.nn.functional.embedding(input_ids, self.weight)
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+
+    def _cuda_forward(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
         if out is None:
             out = alloc_func(
                 (input_ids.shape[0], self.weight.shape[1]), dtype=self.weight.dtype, device=self.weight.device
             )
-
         embedding_kernel(
             input_ids=input_ids,
             weight=self.weight,
             vob_start_id=0,
-            vob_end_id=self.end_position_id,
+            vob_end_id=self.max_position_embeddings,
             out=out,
         )
-
         return out
+
+    def __call__(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        return self._forward(input_ids=input_ids, out=out, alloc_func=alloc_func)
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 7b966600c..16a2e53da 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -51,7 +51,7 @@ def _cuda_forward(
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
         return rmsnorm_forward(x=input, weight=self.weight, eps=eps, out=out)
 
-    def apply(
+    def __call__(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
         return self._forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
@@ -101,7 +101,7 @@ def _cuda_forward(
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
         return layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps, out=out)
 
-    def apply(
+    def __call__(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
         return self._forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
diff --git a/lightllm/models/bloom/layer_infer/post_layer_infer.py b/lightllm/models/bloom/layer_infer/post_layer_infer.py
index f4fff116c..ec1d94458 100644
--- a/lightllm/models/bloom/layer_infer/post_layer_infer.py
+++ b/lightllm/models/bloom/layer_infer/post_layer_infer.py
@@ -16,6 +16,4 @@ def __init__(self, network_config):
         return
 
     def _norm(self, input, infer_state, layer_weight: BloomPreAndPostLayerWeight) -> torch.Tensor:
-        return layer_weight.final_norm_weight_.layernorm_forward(
-            input=input, eps=self.eps_, alloc_func=self.alloc_tensor
-        )
+        return layer_weight.final_norm_weight_(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
diff --git a/lightllm/models/bloom/layer_infer/pre_layer_infer.py b/lightllm/models/bloom/layer_infer/pre_layer_infer.py
index dfe396ab5..e84069e11 100644
--- a/lightllm/models/bloom/layer_infer/pre_layer_infer.py
+++ b/lightllm/models/bloom/layer_infer/pre_layer_infer.py
@@ -15,17 +15,17 @@ def __init__(self, network_config):
         return
 
     def _norm(self, input, infer_state, layer_weight: BloomPreAndPostLayerWeight) -> torch.Tensor:
-        return layer_weight.pre_norm_weight_.layernorm_forward(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
+        return layer_weight.pre_norm_weight_(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
 
     def context_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BloomPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(input_ids=input_ids, alloc_func=self.alloc_tensor)
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
         input_embdings = self._norm(input_embdings, infer_state, layer_weight)
         return input_embdings
 
     def token_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BloomPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(input_ids=input_ids, alloc_func=self.alloc_tensor)
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
         input_embdings = self._norm(input_embdings, infer_state, layer_weight)
diff --git a/lightllm/models/bloom/layer_infer/transformer_layer_infer.py b/lightllm/models/bloom/layer_infer/transformer_layer_infer.py
index 808788f71..60d584eeb 100755
--- a/lightllm/models/bloom/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/bloom/layer_infer/transformer_layer_infer.py
@@ -57,14 +57,14 @@ def _token_attention_kernel(
     def _att_norm(
         self, input: torch.Tensor, infer_state: InferStateInfo, layer_weight: BloomTransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.att_norm_weight_.layernorm_forward(
+        return layer_weight.att_norm_weight_(
             input=input.view(-1, self.embed_dim_), eps=self.eps_, alloc_func=self.alloc_tensor
         )
 
     def _ffn_norm(
         self, input: torch.Tensor, infer_state: InferStateInfo, layer_weight: BloomTransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.ffn_norm_weight_.layernorm_forward(
+        return layer_weight.ffn_norm_weight_(
             input=input.view(-1, self.embed_dim_), eps=self.eps_, alloc_func=self.alloc_tensor
         )
 
diff --git a/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
index e02af4b4e..000a06912 100644
--- a/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
@@ -1,5 +1,3 @@
-import torch
-import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LayerNormWeight
 
@@ -7,18 +5,24 @@
 class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
+        hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
         self.pre_norm_weight_ = LayerNormWeight(
+            dim=hidden_size,
             weight_name="word_embeddings_layernorm.weight",
             data_type=self.data_type_,
             bias_name="word_embeddings_layernorm.bias",
         )
         self.final_norm_weight_ = LayerNormWeight(
+            dim=hidden_size,
             weight_name="ln_f.weight",
             data_type=self.data_type_,
             bias_name="ln_f.bias",
         )
 
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="word_embeddings.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py
index 8695f2de8..801ab6aba 100644
--- a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py
@@ -165,20 +165,14 @@ def _get_qkv(
             q, cache_kv = layer_weight.qkv_a_proj_with_mqa_.mm(input).split(
                 [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], dim=-1
             )
-            q = layer_weight.q_a_layernorm_.rmsnorm_forward(
-                input=q,
-                eps=self.eps_,
-                alloc_func=self.alloc_tensor,
-            )
+            q = layer_weight.q_a_layernorm_(input=q, eps=self.eps_, alloc_func=self.alloc_tensor)
             q = layer_weight.q_b_proj_.mm(q)
             cache_kv = cache_kv.view(-1, 1, self.kv_lora_rank + self.qk_rope_head_dim)
         q = q.view(-1, self.tp_q_head_num_, self.qk_nope_head_dim + self.qk_rope_head_dim)
         q_nope, q_rope = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
 
-        layer_weight.kv_a_layernorm_.rmsnorm_forward(
-            cache_kv[:, :, : self.kv_lora_rank],
-            eps=self.eps_,
-            out=cache_kv[:, :, : self.kv_lora_rank],
+        layer_weight.kv_a_layernorm_(
+            cache_kv[:, :, : self.kv_lora_rank], eps=self.eps_, out=cache_kv[:, :, : self.kv_lora_rank]
         )
 
         rotary_emb_fwd(
@@ -208,10 +202,8 @@ def _tpsp_get_qkv(
             cache_kv = layer_weight.kv_a_proj_with_mqa_.mm(input).view(-1, 1, self.kv_lora_rank + self.qk_rope_head_dim)
             q = q.view(-1, self.tp_q_head_num_, self.qk_nope_head_dim + self.qk_rope_head_dim)
             q_nope, q_rope = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
-            layer_weight.kv_a_layernorm_.rmsnorm_forward(
-                cache_kv[:, :, : self.kv_lora_rank],
-                eps=self.eps_,
-                out=cache_kv[:, :, : self.kv_lora_rank],
+            layer_weight.kv_a_layernorm_(
+                cache_kv[:, :, : self.kv_lora_rank], eps=self.eps_, out=cache_kv[:, :, : self.kv_lora_rank]
             )
             rotary_emb_fwd(
                 q_rope,
@@ -244,19 +236,13 @@ def _tpsp_get_qkv(
                 position_sin = infer_state.position_sin
 
             q, cache_kv = qkv.split([self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], dim=-1)
-            q = layer_weight.q_a_layernorm_.rmsnorm_forward(
-                q,
-                eps=self.eps_,
-                alloc_func=self.alloc_tensor,
-            )
+            q = layer_weight.q_a_layernorm_(input=q, eps=self.eps_, alloc_func=self.alloc_tensor)
             q = layer_weight.q_b_proj_.mm(q)
             cache_kv = cache_kv.view(-1, 1, self.kv_lora_rank + self.qk_rope_head_dim)
             q = q.view(-1, self.tp_q_head_num_, self.qk_nope_head_dim + self.qk_rope_head_dim)
             q_nope, q_rope = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
-            layer_weight.kv_a_layernorm_.rmsnorm_forward(
-                cache_kv[:, :, : self.kv_lora_rank],
-                eps=self.eps_,
-                out=cache_kv[:, :, : self.kv_lora_rank],
+            layer_weight.kv_a_layernorm_(
+                cache_kv[:, :, : self.kv_lora_rank], eps=self.eps_, out=cache_kv[:, :, : self.kv_lora_rank]
             )
             rotary_emb_fwd(
                 q_rope,
diff --git a/lightllm/models/deepseek_mtp/layer_infer/pre_layer_infer.py b/lightllm/models/deepseek_mtp/layer_infer/pre_layer_infer.py
index adb749c40..7e1224558 100644
--- a/lightllm/models/deepseek_mtp/layer_infer/pre_layer_infer.py
+++ b/lightllm/models/deepseek_mtp/layer_infer/pre_layer_infer.py
@@ -22,16 +22,8 @@ def _mtp_context_forward(
             input_embdings.shape[0] == tgt_embdings.shape[0]
         ), f"shape {input_embdings.shape} != shape {tgt_embdings.shape}"
 
-        layer_weight.enorm_weight_.rmsnorm_forward(
-            input=input_embdings,
-            eps=self.eps_,
-            out=input_embdings,
-        )
-        layer_weight.hnorm_weight_.rmsnorm_forward(
-            input=tgt_embdings,
-            eps=self.eps_,
-            out=tgt_embdings,
-        )
+        layer_weight.enorm_weight_(input=input_embdings, eps=self.eps_, out=input_embdings)
+        layer_weight.hnorm_weight_(input=tgt_embdings, eps=self.eps_, out=tgt_embdings)
         cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1)
 
         ans_logics = layer_weight.eh_proj_weight_.mm(cat_embdings)
@@ -43,16 +35,8 @@ def _mtp_token_forward(
         tgt_embdings = infer_state.mtp_draft_input_hiddens
         assert input_embdings.shape[0] == tgt_embdings.shape[0]
 
-        layer_weight.enorm_weight_.rmsnorm_forward(
-            input=input_embdings,
-            eps=self.eps_,
-            out=input_embdings,
-        )
-        layer_weight.hnorm_weight_.rmsnorm_forward(
-            input=tgt_embdings,
-            eps=self.eps_,
-            out=tgt_embdings,
-        )
+        layer_weight.enorm_weight_(input=input_embdings, eps=self.eps_, out=input_embdings)
+        layer_weight.hnorm_weight_(input=tgt_embdings, eps=self.eps_, out=tgt_embdings)
         cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1)
 
         ans_logics = layer_weight.eh_proj_weight_.mm(cat_embdings)
diff --git a/lightllm/models/gemma3/layer_infer/transformer_layer_infer.py b/lightllm/models/gemma3/layer_infer/transformer_layer_infer.py
index 1f386625b..183c4d8d4 100644
--- a/lightllm/models/gemma3/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/gemma3/layer_infer/transformer_layer_infer.py
@@ -37,14 +37,10 @@ def _get_qkv(
         q = q.view(-1, self.tp_q_head_num_, self.head_dim_)
         k = cache_kv[:, 0 : self.tp_k_head_num_, :]
 
-        q = layer_weight.q_norm_weight_.rmsnorm_forward(
-            input=q.float(), eps=self.eps_, alloc_func=self.alloc_tensor
-        ).to(cache_kv.dtype)
+        q = layer_weight.q_norm_weight_(input=q.float(), eps=self.eps_, alloc_func=self.alloc_tensor).to(cache_kv.dtype)
 
-        cache_kv[:, 0 : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_.rmsnorm_forward(
-            input=k.float(),
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
+        cache_kv[:, 0 : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_(
+            input=k.float(), eps=self.eps_, alloc_func=self.alloc_tensor
         ).to(cache_kv.dtype)
 
         is_sliding = bool((self.layer_num_ + 1) % self.sliding_window_pattern)
@@ -92,7 +88,7 @@ def context_forward(self, input_embdings, infer_state: InferStateInfo, layer_wei
         input_embdings.add_(o.view(-1, self.embed_dim_))
         o = None
 
-        input1 = layer_weight.pre_feedforward_layernorm_weight_.rmsnorm_forward(
+        input1 = layer_weight.pre_feedforward_layernorm_weight_(
             input=input_embdings.float(), eps=self.eps_, alloc_func=self.alloc_tensor
         ).to(torch.bfloat16)
 
@@ -101,10 +97,8 @@ def context_forward(self, input_embdings, infer_state: InferStateInfo, layer_wei
         if self.tp_world_size_ > 1:
             all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
 
-        ffn_out = layer_weight.post_feedforward_layernorm_weight_.rmsnorm_forward(
-            input=ffn_out.float(),
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
+        ffn_out = layer_weight.post_feedforward_layernorm_weight_(
+            input=ffn_out.float(), eps=self.eps_, alloc_func=self.alloc_tensor
         ).to(torch.bfloat16)
 
         input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
@@ -127,7 +121,7 @@ def token_forward(self, input_embdings, infer_state: InferStateInfo, layer_weigh
         input_embdings.add_(o.view(-1, self.embed_dim_))
         o = None
 
-        input1 = layer_weight.pre_feedforward_layernorm_weight_.rmsnorm_forward(
+        input1 = layer_weight.pre_feedforward_layernorm_weight_(
             input=input_embdings.float(), eps=self.eps_, alloc_func=self.alloc_tensor
         ).to(torch.bfloat16)
 
@@ -136,10 +130,8 @@ def token_forward(self, input_embdings, infer_state: InferStateInfo, layer_weigh
         if self.tp_world_size_ > 1:
             all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
 
-        ffn_out = layer_weight.post_feedforward_layernorm_weight_.rmsnorm_forward(
-            input=ffn_out.float(),
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
+        ffn_out = layer_weight.post_feedforward_layernorm_weight_(
+            input=ffn_out.float(), eps=self.eps_, alloc_func=self.alloc_tensor
         ).to(torch.bfloat16)
 
         input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
diff --git a/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py
index 858937d8c..336aa2fc3 100644
--- a/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py
@@ -5,8 +5,12 @@
 class Gemma3PreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
+        hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
 
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="language_model.model.embed_tokens.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py b/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py
index 468d471d2..e21788d76 100644
--- a/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py
+++ b/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py
@@ -22,20 +22,14 @@ def _norm(self, input, infer_state, layer_weight: Gemma_2bPreAndPostLayerWeight)
         return input * self.normfactor
 
     def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_weight: Gemma_2bPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(
-            input_ids=input_ids,
-            alloc_func=self.alloc_tensor,
-        )
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
         input_embdings = self._norm(input_embdings, infer_state, layer_weight)
         return input_embdings
 
     def token_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_weight: Gemma_2bPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(
-            input_ids=input_ids,
-            alloc_func=self.alloc_tensor,
-        )
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
         input_embdings = self._norm(input_embdings, infer_state, layer_weight)
diff --git a/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py
index 6e052caa6..fbfb2ee75 100644
--- a/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py
@@ -5,8 +5,12 @@
 class Gemma_2bPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
+        hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
 
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="model.embed_tokens.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py
index 7419d35e9..3bb526c79 100644
--- a/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py
@@ -5,9 +5,20 @@
 class Internlm2PreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
-        self.wte_weight_ = EmbeddingWeight(weight_name="model.tok_embeddings.weight", data_type=self.data_type_)
-        self.lm_head_weight_ = LMHeadWeight(weight_name="output.weight", data_type=self.data_type_)
         hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
+        self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="model.tok_embeddings.weight",
+            data_type=self.data_type_,
+        )
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="output.weight",
+            data_type=self.data_type_,
+        )
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.norm.weight",
diff --git a/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py
index caef47399..b52619212 100644
--- a/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py
@@ -1,4 +1,3 @@
-import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, RMSNormWeight, ROWMMWeight
 
@@ -7,7 +6,10 @@ class Internlm2RewardPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
         hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="model.tok_embeddings.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/llama/layer_infer/post_layer_infer.py b/lightllm/models/llama/layer_infer/post_layer_infer.py
index 8bc10d623..771416415 100644
--- a/lightllm/models/llama/layer_infer/post_layer_infer.py
+++ b/lightllm/models/llama/layer_infer/post_layer_infer.py
@@ -19,7 +19,7 @@ def __init__(self, network_config):
         return
 
     def _norm(self, input, infer_state, layer_weight: LlamaPreAndPostLayerWeight) -> torch.Tensor:
-        return layer_weight.final_norm_weight_.rmsnorm_forward(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
+        return layer_weight.final_norm_weight_(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
 
     def _slice_get_last_input(self, input_embdings: torch.Tensor, infer_state: LlamaInferStateInfo):
         embed_dim_ = input_embdings.shape[1]
@@ -66,7 +66,7 @@ def token_forward(
         input_embdings = None
         last_input = self._norm(last_input, infer_state, layer_weight)
         last_input = last_input.permute(1, 0).view(-1, token_num)
-        logic_batch = layer_weight.lm_head_weight_.lm_head(input=last_input, alloc_func=self.alloc_tensor)
+        logic_batch = layer_weight.lm_head_weight_(input=last_input, alloc_func=self.alloc_tensor)
         last_input = None
         vocab_size = layer_weight.lm_head_weight_.vocab_size
         if self.tp_world_size_ == 1:
diff --git a/lightllm/models/llama/layer_infer/pre_layer_infer.py b/lightllm/models/llama/layer_infer/pre_layer_infer.py
index f4f150b17..63a2fe4d1 100644
--- a/lightllm/models/llama/layer_infer/pre_layer_infer.py
+++ b/lightllm/models/llama/layer_infer/pre_layer_infer.py
@@ -15,13 +15,13 @@ def __init__(self, network_config):
         return
 
     def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(input_ids=input_ids, alloc_func=self.alloc_tensor)
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
         return input_embdings
 
     def token_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(input_ids=input_ids, alloc_func=self.alloc_tensor)
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
         return input_embdings
diff --git a/lightllm/models/llama/layer_infer/transformer_layer_infer.py b/lightllm/models/llama/layer_infer/transformer_layer_infer.py
index 2a9a54319..820c5efa0 100644
--- a/lightllm/models/llama/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/llama/layer_infer/transformer_layer_infer.py
@@ -69,16 +69,12 @@ def _token_attention_kernel(
     def _att_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: LlamaTransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.att_norm_weight_.rmsnorm_forward(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
+        return layer_weight.att_norm_weight_(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
 
     def _ffn_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: LlamaTransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.ffn_norm_weight_.rmsnorm_forward(
-            input=input,
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
-        )
+        return layer_weight.ffn_norm_weight_(input=input, eps=self.eps_, alloc_func=self.alloc_tensor)
 
     def _get_qkv(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: LlamaTransformerLayerWeight
diff --git a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
index 82c1f3aa2..d240e9ab5 100644
--- a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
@@ -7,18 +7,20 @@ def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
 
         hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="model.embed_tokens.weight",
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            self.lm_head_weight_: LMHeadWeight = self.wte_weight_
-        else:
-            self.lm_head_weight_ = LMHeadWeight(
-                weight_name="lm_head.weight",
-                data_type=self.data_type_,
-            )
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="model.embed_tokens.weight" if tie_word_embeddings else "lm_head.weight",
+            data_type=self.data_type_,
+        )
 
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
diff --git a/lightllm/models/mistral_mtp/layer_infer/pre_layer_infer.py b/lightllm/models/mistral_mtp/layer_infer/pre_layer_infer.py
index dbe9b61c8..96a15d18a 100644
--- a/lightllm/models/mistral_mtp/layer_infer/pre_layer_infer.py
+++ b/lightllm/models/mistral_mtp/layer_infer/pre_layer_infer.py
@@ -19,22 +19,10 @@ def _mtp_context_forward(
             input_embdings.shape[0] == tgt_embdings.shape[0]
         ), f"shape {input_embdings.shape} != shape {tgt_embdings.shape}"
 
-        layer_weight.enorm_weight_.rmsnorm_forward(
-            input=input_embdings,
-            eps=self.eps_,
-            out=input_embdings,
-        )
+        layer_weight.enorm_weight_(input=input_embdings, eps=self.eps_, out=input_embdings)
 
-        tgt_embdings = layer_weight.final_norm_weight_.rmsnorm_forward(
-            input=tgt_embdings,
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
-        )
-        layer_weight.hnorm_weight_.rmsnorm_forward(
-            input=tgt_embdings,
-            eps=self.eps_,
-            out=tgt_embdings,
-        )
+        tgt_embdings = layer_weight.final_norm_weight_(input=tgt_embdings, eps=self.eps_, alloc_func=self.alloc_tensor)
+        layer_weight.hnorm_weight_(input=tgt_embdings, eps=self.eps_, out=tgt_embdings)
 
         cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1)
 
@@ -47,22 +35,10 @@ def _mtp_token_forward(
         tgt_embdings = infer_state.mtp_draft_input_hiddens
         assert input_embdings.shape[0] == tgt_embdings.shape[0]
 
-        layer_weight.enorm_weight_.rmsnorm_forward(
-            input=input_embdings,
-            eps=self.eps_,
-            out=input_embdings,
-        )
+        layer_weight.enorm_weight_(input=input_embdings, eps=self.eps_, out=input_embdings)
 
-        tgt_embdings = layer_weight.final_norm_weight_.rmsnorm_forward(
-            input=tgt_embdings,
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
-        )
-        layer_weight.hnorm_weight_.rmsnorm_forward(
-            input=tgt_embdings,
-            eps=self.eps_,
-            out=tgt_embdings,
-        )
+        tgt_embdings = layer_weight.final_norm_weight_(input=tgt_embdings, eps=self.eps_, alloc_func=self.alloc_tensor)
+        layer_weight.hnorm_weight_(input=tgt_embdings, eps=self.eps_, out=tgt_embdings)
 
         cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1)
 
diff --git a/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py
index c35f5c78c..52d1a54f5 100644
--- a/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen/layer_weights/pre_and_post_layer_weight.py
@@ -1,5 +1,3 @@
-import torch
-import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, RMSNormWeight
 
@@ -8,11 +6,16 @@ class QwenPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
         hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="transformer.wte.weight",
             data_type=self.data_type_,
         )
         self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="lm_head.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
index c85c423c2..5cd29dcdb 100644
--- a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
@@ -62,13 +62,9 @@ def _get_qkv(
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
 
-        layer_weight.q_norm_weight_.rmsnorm_forward(
-            q.view(-1, self.head_dim_),
-            eps=self.eps_,
-            out=q.view(-1, self.head_dim_),
-        )
+        layer_weight.q_norm_weight_(q.view(-1, self.head_dim_), eps=self.eps_, out=q.view(-1, self.head_dim_))
 
-        cache_kv[:, : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_.rmsnorm_forward(
+        cache_kv[:, : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_(
             input=cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
             eps=self.eps_,
             alloc_func=self.alloc_tensor,
@@ -100,13 +96,9 @@ def _tpsp_get_qkv(
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
 
-        layer_weight.q_norm_weight_.rmsnorm_forward(
-            q.view(-1, self.head_dim_),
-            eps=self.eps_,
-            out=q.view(-1, self.head_dim_),
-        )
+        layer_weight.q_norm_weight_(q.view(-1, self.head_dim_), eps=self.eps_, out=q.view(-1, self.head_dim_))
 
-        cache_kv[:, : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_.rmsnorm_forward(
+        cache_kv[:, : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_(
             cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
             eps=self.eps_,
             alloc_func=self.alloc_tensor,
diff --git a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
index 43758731b..b6c7c50a0 100644
--- a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
@@ -1,4 +1,3 @@
-import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, RMSNormWeight
 
@@ -7,18 +6,20 @@ class Qwen3VLMOEPreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
         hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="model.language_model.embed_tokens.weight",
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            self.lm_head_weight_: LMHeadWeight = self.wte_weight_
-        else:
-            self.lm_head_weight_ = LMHeadWeight(
-                weight_name="lm_head.weight",
-                data_type=self.data_type_,
-            )
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="model.language_model.embed_tokens.weight" if tie_word_embeddings else "lm_head.weight",
+            data_type=self.data_type_,
+        )
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.language_model.norm.weight",
diff --git a/lightllm/models/stablelm/layer_infer/transformer_layer_infer.py b/lightllm/models/stablelm/layer_infer/transformer_layer_infer.py
index f908dbdd3..55848ce66 100755
--- a/lightllm/models/stablelm/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/stablelm/layer_infer/transformer_layer_infer.py
@@ -53,17 +53,13 @@ def _tpsp_get_o(self, input, infer_state, layer_weight) -> Tuple[torch.Tensor, t
     def _att_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: StablelmTransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.att_norm_weight_.layernorm_forward(
-            input=input.view(-1, self.embed_dim_),
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
+        return layer_weight.att_norm_weight_(
+            input=input.view(-1, self.embed_dim_), eps=self.eps_, alloc_func=self.alloc_tensor
         )
 
     def _ffn_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: StablelmTransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.ffn_norm_weight_.layernorm_forward(
-            input=input.view(-1, self.embed_dim_),
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
+        return layer_weight.ffn_norm_weight_(
+            input=input.view(-1, self.embed_dim_), eps=self.eps_, alloc_func=self.alloc_tensor
         )
diff --git a/lightllm/models/starcoder/layer_infer/pre_layer_infer.py b/lightllm/models/starcoder/layer_infer/pre_layer_infer.py
index 6b88c066e..b3cd083c3 100644
--- a/lightllm/models/starcoder/layer_infer/pre_layer_infer.py
+++ b/lightllm/models/starcoder/layer_infer/pre_layer_infer.py
@@ -14,24 +14,18 @@ def __init__(self, network_config):
         self.layer_norm_eps_ = network_config["layer_norm_epsilon"]
 
     def context_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: StarcoderPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(input_ids=input_ids, alloc_func=self.alloc_tensor)
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
 
-        position_embeds = layer_weight.wpe_weight_.embedding(
-            input_ids=infer_state.position_ids,
-            alloc_func=self.alloc_tensor,
-        )
+        position_embeds = layer_weight.wpe_weight_(input_ids=infer_state.position_ids, alloc_func=self.alloc_tensor)
 
         return input_embdings.add_(position_embeds)
 
     def token_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: StarcoderPreAndPostLayerWeight):
-        input_embdings = layer_weight.wte_weight_.embedding(input_ids=input_ids, alloc_func=self.alloc_tensor)
+        input_embdings = layer_weight.wte_weight_(input_ids=input_ids, alloc_func=self.alloc_tensor)
         if self.tp_world_size_ > 1:
             all_reduce(input_embdings, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
 
-        position_embeds = layer_weight.wpe_weight_.embedding(
-            input_ids=infer_state.position_ids,
-            alloc_func=self.alloc_tensor,
-        )
+        position_embeds = layer_weight.wpe_weight_(input_ids=infer_state.position_ids, alloc_func=self.alloc_tensor)
         return input_embdings.add_(position_embeds)
diff --git a/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
index 939c6a146..a258480f6 100644
--- a/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder/layer_weights/pre_and_post_layer_weight.py
@@ -13,11 +13,17 @@ def __init__(self, data_type, network_config):
 
     def _create_weight(self):
         hidden_size = self.network_config["hidden_size"]
+        vocab_size = self.network_config["vocab_size"]
+        max_position_embeddings = self.network_config["max_position_embeddings"]
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="transformer.wte.weight",
             data_type=self.data_type_,
         )
         self.wpe_weight_ = NoTpPosEmbeddingWeight(
+            dim=hidden_size,
+            max_position_embeddings=max_position_embeddings,
             weight_name="transformer.wpe.weight",
             data_type=self.data_type_,
         )
@@ -29,6 +35,8 @@ def _create_weight(self):
             data_type=self.data_type_,
         )
         self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="lm_head.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/starcoder2/layer_infer/transformer_layer_infer.py b/lightllm/models/starcoder2/layer_infer/transformer_layer_infer.py
index 09e3299eb..3e32682ec 100644
--- a/lightllm/models/starcoder2/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/starcoder2/layer_infer/transformer_layer_infer.py
@@ -11,19 +11,15 @@ def __init__(self, layer_num, network_config):
     def _att_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: Starcoder2TransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.att_norm_weight_.layernorm_forward(
-            input=input.view(-1, self.embed_dim_),
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
+        return layer_weight.att_norm_weight_(
+            input=input.view(-1, self.embed_dim_), eps=self.eps_, alloc_func=self.alloc_tensor
         )
 
     def _ffn_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: Starcoder2TransformerLayerWeight
     ) -> torch.Tensor:
-        return layer_weight.ffn_norm_weight_.layernorm_forward(
-            input=input.view(-1, self.embed_dim_),
-            eps=self.eps_,
-            alloc_func=self.alloc_tensor,
+        return layer_weight.ffn_norm_weight_(
+            input=input.view(-1, self.embed_dim_), eps=self.eps_, alloc_func=self.alloc_tensor
         )
 
     def _ffn(
diff --git a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
index 7890f82dc..c5ea7d922 100644
--- a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
@@ -1,5 +1,3 @@
-import torch
-import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LMHeadWeight, LayerNormWeight
 
@@ -8,18 +6,20 @@ class Starcoder2PreAndPostLayerWeight(PreAndPostLayerWeight):
     def __init__(self, data_type, network_config):
         super().__init__(data_type, network_config)
         hidden_size = network_config["hidden_size"]
+        vocab_size = network_config["vocab_size"]
         self.wte_weight_ = EmbeddingWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
             weight_name="model.embed_tokens.weight",
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            self.lm_head_weight_: LMHeadWeight = self.wte_weight_
-        else:
-            self.lm_head_weight_ = LMHeadWeight(
-                weight_name="lm_head.weight",
-                data_type=self.data_type_,
-            )
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="model.embed_tokens.weight" if tie_word_embeddings else "lm_head.weight",
+            data_type=self.data_type_,
+        )
 
         self.final_norm_weight_ = LayerNormWeight(
             dim=hidden_size,

From 4c2b33f1125beff46e71bf0bef515739f606f57f Mon Sep 17 00:00:00 2001
From: sufubao <sufubao@sensetime.com>
Date: Mon, 12 Jan 2026 08:47:26 +0000
Subject: [PATCH 06/65] fix LMHeadWeight

---
 .../meta_weights/embedding_weight.py          | 26 ++++++++++++++++---
 .../pre_and_post_layer_weight.py              | 22 +++++++++++-----
 .../pre_and_post_layer_weight.py              | 22 +++++++++++-----
 .../pre_and_post_layer_weight.py              | 22 +++++++++++-----
 4 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index d1de857cd..e228d5c86 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -78,7 +78,14 @@ def __call__(
 
 
 class LMHeadWeight(BaseWeightTpl, PlatformAwareOp):
-    def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch.dtype):
+    def __init__(
+        self,
+        dim: int,
+        vocab_size: int,
+        weight_name: str,
+        data_type: torch.dtype,
+        shared_weight: Optional[EmbeddingWeight] = None,
+    ):
         super().__init__()
         self.dim = dim
         self.vocab_size = vocab_size
@@ -90,13 +97,24 @@ def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch
         self.tp_vocab_end_id = int(split_indexes[self.tp_rank_ + 1])
         self.weight_name: str = weight_name
         self.data_type_ = data_type
-        self._create_weight()
+        self._shared_weight = shared_weight
+        if shared_weight is None:
+            self._create_weight()
+
+    @property
+    def weight(self) -> torch.Tensor:
+        if self._shared_weight is not None:
+            return self._shared_weight.weight
+        return self._weight
 
     def _create_weight(self):
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
-        self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
+        self._weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
+        # When using shared weight, no need to load - EmbeddingWeight already loaded it
+        if self._shared_weight is not None:
+            return
         if self.weight_name not in weights:
             return
         t_weight = weights[self.weight_name]
@@ -105,7 +123,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
             loaded_vocab_size == self.vocab_size
         ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
         logger.info(f"loaded weight vocab_size: {self.vocab_size}")
-        self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
+        self._weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
 
     def _native_forward(
         self, input: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
diff --git a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
index d240e9ab5..2e14eca26 100644
--- a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
@@ -15,12 +15,22 @@ def __init__(self, data_type, network_config):
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        self.lm_head_weight_ = LMHeadWeight(
-            dim=hidden_size,
-            vocab_size=vocab_size,
-            weight_name="model.embed_tokens.weight" if tie_word_embeddings else "lm_head.weight",
-            data_type=self.data_type_,
-        )
+        if tie_word_embeddings:
+            # Share weight with EmbeddingWeight to save memory
+            self.lm_head_weight_ = LMHeadWeight(
+                dim=hidden_size,
+                vocab_size=vocab_size,
+                weight_name="model.embed_tokens.weight",
+                data_type=self.data_type_,
+                shared_weight=self.wte_weight_,
+            )
+        else:
+            self.lm_head_weight_ = LMHeadWeight(
+                dim=hidden_size,
+                vocab_size=vocab_size,
+                weight_name="lm_head.weight",
+                data_type=self.data_type_,
+            )
 
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
diff --git a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
index b6c7c50a0..475bcee95 100644
--- a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
@@ -14,12 +14,22 @@ def __init__(self, data_type, network_config):
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        self.lm_head_weight_ = LMHeadWeight(
-            dim=hidden_size,
-            vocab_size=vocab_size,
-            weight_name="model.language_model.embed_tokens.weight" if tie_word_embeddings else "lm_head.weight",
-            data_type=self.data_type_,
-        )
+        if tie_word_embeddings:
+            # Share weight with EmbeddingWeight to save memory
+            self.lm_head_weight_ = LMHeadWeight(
+                dim=hidden_size,
+                vocab_size=vocab_size,
+                weight_name="model.language_model.embed_tokens.weight",
+                data_type=self.data_type_,
+                shared_weight=self.wte_weight_,
+            )
+        else:
+            self.lm_head_weight_ = LMHeadWeight(
+                dim=hidden_size,
+                vocab_size=vocab_size,
+                weight_name="lm_head.weight",
+                data_type=self.data_type_,
+            )
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.language_model.norm.weight",
diff --git a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
index c5ea7d922..e6d5cb441 100644
--- a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
@@ -14,12 +14,22 @@ def __init__(self, data_type, network_config):
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        self.lm_head_weight_ = LMHeadWeight(
-            dim=hidden_size,
-            vocab_size=vocab_size,
-            weight_name="model.embed_tokens.weight" if tie_word_embeddings else "lm_head.weight",
-            data_type=self.data_type_,
-        )
+        if tie_word_embeddings:
+            # Share weight with EmbeddingWeight to save memory
+            self.lm_head_weight_ = LMHeadWeight(
+                dim=hidden_size,
+                vocab_size=vocab_size,
+                weight_name="model.embed_tokens.weight",
+                data_type=self.data_type_,
+                shared_weight=self.wte_weight_,
+            )
+        else:
+            self.lm_head_weight_ = LMHeadWeight(
+                dim=hidden_size,
+                vocab_size=vocab_size,
+                weight_name="lm_head.weight",
+                data_type=self.data_type_,
+            )
 
         self.final_norm_weight_ = LayerNormWeight(
             dim=hidden_size,

From c901ce9c86cd65c7f881bc4fd076dea608d03978 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 11:25:25 +0000
Subject: [PATCH 07/65] fix gemma norm & slicer

---
 .../layer_weights/meta_weights/__init__.py    |  2 +-
 .../layer_weights/meta_weights/base_weight.py |  2 ++
 .../meta_weights/mm_weight/mm_slicer.py       |  2 +-
 .../meta_weights/mm_weight/rowmm_weight.py    | 12 ++++----
 .../layer_weights/meta_weights/norm_weight.py | 24 +++++++++------
 lightllm/common/quantization/no_quant.py      |  2 +-
 .../layer_weights/transformer_layer_weight.py | 29 +++++++++----------
 .../pre_and_post_layer_weight.py              |  1 +
 .../layer_weights/transformer_layer_weight.py | 19 +++++++++---
 .../layer_weights/transformer_layer_weight.py |  8 +++--
 .../layer_weights/transformer_layer_weight.py | 10 +++----
 .../layer_weights/transformer_layer_weight.py |  5 ++--
 12 files changed, 68 insertions(+), 48 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index cbf399843..47bf7c05f 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -5,7 +5,7 @@
     KVROWNMMWeight,
     COLMMWeight,
 )
-from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight
+from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight
 from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
 from .att_sink_weight import TpAttSinkWeight
 from .fused_moe.fused_moe_weight_tp import create_tp_moe_wegiht_obj
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
index b17da6682..58860ab30 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
@@ -6,6 +6,7 @@
 
 class BaseWeight(ABC):
     def __init__(self):
+        super().__init__()
         pass
 
     @abstractmethod
@@ -19,6 +20,7 @@ def _create_weight(self):
 
 class BaseWeightTpl(BaseWeight):
     def __init__(self, tp_rank: int = None, tp_world_size: int = None, data_type: torch.dtype = None):
+        super().__init__()
         self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
         self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
         self.device_id_ = get_current_device_id()
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
index 4bc3b44a8..ddbf98a86 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py
@@ -24,7 +24,7 @@ def _slice_bias(self, bias):
 
     def _get_slice_start_end(self, size: int) -> Tuple[int, int]:
         tp_size = size * self.repeat_times_ // self.tp_world_size_
-        start = tp_size * (self.tp_rank_ % self.repeat_times_)
+        start = tp_size * (self.tp_rank_ // self.repeat_times_)
         end = start + tp_size
         return start, end
 
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
index e73b0cecb..d7554b375 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
@@ -53,8 +53,8 @@ def __init__(
         tp_rank: int = None,
         tp_world_size: int = None,
     ) -> None:
-        self.tp_rank = tp_rank if tp_rank is not None else get_current_rank_in_dp()
-        self.tp_world_size = tp_world_size if tp_world_size is not None else get_dp_world_size()
+        self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
+        self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
         self.repeat_times = 1
         assert kv_head_num % self.tp_world_size_ == 0 or self.tp_world_size_ % kv_head_num == 0, (
             f"kv_head_num must be divisible by tp_world_size_ or "
@@ -70,13 +70,13 @@ def __init__(
             data_type=data_type,
             bias_names=bias_names,
             quant_method=quant_method,
-            tp_rank=self.tp_rank,
-            tp_world_size=self.tp_world_size,
+            tp_rank=self.tp_rank_,
+            tp_world_size=self.tp_world_size_,
         )
         self.param_slicer = get_row_slice_mixin(
             self.quant_method.method_name,
-            tp_rank=self.tp_rank,
-            tp_world_size=self.tp_world_size,
+            tp_rank=self.tp_rank_,
+            tp_world_size=self.tp_world_size_,
             repeat_times=self.repeat_times,
         )
 
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 16a2e53da..023201610 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -20,9 +20,7 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
         self._create_weight()
 
     def _create_weight(self):
-        self.weight: torch.Tensor = torch.nn.Parameter(
-            torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
-        )
+        self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
@@ -67,12 +65,8 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
         self._create_weight()
 
     def _create_weight(self):
-        self.weight: torch.Tensor = torch.nn.Parameter(
-            torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
-        )
-        self.bias: torch.Tensor = torch.nn.Parameter(
-            torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
-        )
+        self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
+        self.bias: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
@@ -146,3 +140,15 @@ def load_hf_weights(self, weights):
             self.weight[:, end - start].copy_(t_weight[start:end].to(self.data_type_))
             # the padding part is zero
             self.weight[:, end:].zero_()
+
+
+class NoTpGEMMANormWeight(RMSNormWeight):
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
+        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type, bias_name=bias_name)
+        self.tp_world_size_ = 1  # no TP slicing: load_hf_weights copies the whole tensor
+        self.tp_rank_ = 0
+
+    def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
+        if self.weight_name in weights:  # calls whose dict lacks this key must leave the weight untouched
+            self.weight.copy_(weights[self.weight_name])
+            self.weight += 1  # apply the +1 offset once, right after the copy, so repeated calls cannot stack it
diff --git a/lightllm/common/quantization/no_quant.py b/lightllm/common/quantization/no_quant.py
index f342607c1..987601c5d 100644
--- a/lightllm/common/quantization/no_quant.py
+++ b/lightllm/common/quantization/no_quant.py
@@ -23,7 +23,7 @@ def apply(
             dtype = input_tensor.dtype
             device = input_tensor.device
             if use_custom_tensor_mananger:
-                out = g_cache_manager.alloc_tensor(shape, dtype, device=device, is_graph_out=False)
+                out = g_cache_manager.alloc_tensor(shape, dtype, device=device)
             else:
                 out = torch.empty(shape, dtype=dtype, device=device)
         if bias is None:
diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
index 65e00ebe7..6ff081eef 100644
--- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
@@ -9,7 +9,6 @@
     COLMMWeight,
     RMSNormWeight,
     FusedMoeWeightEP,
-    ROWBMMWeight,
     create_tp_moe_wegiht_obj,
 )
 from functools import partial
@@ -176,20 +175,20 @@ def _init_qkvo(self):
                 layer_num=self.layer_num_,
                 name="q_b_proj",
             )
-        self.k_b_proj_ = ROWBMMWeight(
-            weight_names=f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight",
-            data_type=self.data_type_,
-            quant_cfg=None,
-            layer_num=self.layer_num_,
-            name="k_b_proj",
-        )
-        self.v_b_proj_ = ROWBMMWeight(
-            weight_names=f"model.layers.{self.layer_num_}.self_attn.v_b_proj.weight",
-            data_type=self.data_type_,
-            quant_cfg=None,
-            layer_num=self.layer_num_,
-            name="v_b_proj",
-        )
+        # self.k_b_proj_ = ROWBMMWeight(
+        #     weight_names=f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight",
+        #     data_type=self.data_type_,
+        #     quant_cfg=None,
+        #     layer_num=self.layer_num_,
+        #     name="k_b_proj",
+        # )
+        # self.v_b_proj_ = ROWBMMWeight(
+        #     weight_names=f"model.layers.{self.layer_num_}.self_attn.v_b_proj.weight",
+        #     data_type=self.data_type_,
+        #     quant_cfg=None,
+        #     layer_num=self.layer_num_,
+        #     name="v_b_proj",
+        # )
         if self.enable_cc_method:
             self.cc_kv_b_proj_ = ROWMMWeight(
                 weight_names=f"model.layers.{self.layer_num_}.self_attn.kv_b_proj.weight",
diff --git a/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py
index 336aa2fc3..7ae0fbcca 100644
--- a/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py
@@ -17,6 +17,7 @@ def __init__(self, data_type, network_config):
         self.lm_head_weight_ = self.wte_weight_
 
         self.final_norm_weight_ = NoTpGEMMANormWeight(
+            dim=hidden_size,
             weight_name="language_model.model.norm.weight",
             data_type=self.data_type_,
             bias_name=None,
diff --git a/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py b/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py
index e7808c412..11e3c2f36 100644
--- a/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py
@@ -63,13 +63,24 @@ def _init_qkv(self):
 
     def _init_norm(self):
         super()._init_norm()
-        self.k_norm_weight_ = NoTpGEMMANormWeight(self._k_norm_weight_name, self.data_type_, bias_name=None)
-        self.q_norm_weight_ = NoTpGEMMANormWeight(self._q_norm_weight_name, self.data_type_, bias_name=None)
+
+        self.k_norm_weight_ = NoTpGEMMANormWeight(
+            dim=self.head_dim_, weight_name=self._k_norm_weight_name, data_type=self.data_type_, bias_name=None
+        )
+        self.q_norm_weight_ = NoTpGEMMANormWeight(
+            dim=self.head_dim_, weight_name=self._q_norm_weight_name, data_type=self.data_type_, bias_name=None
+        )
         self.pre_feedforward_layernorm_weight_ = NoTpGEMMANormWeight(
-            self._pre_feedforward_layernorm_name, self.data_type_, bias_name=None
+            dim=self.n_embed,
+            weight_name=self._pre_feedforward_layernorm_name,
+            data_type=self.data_type_,
+            bias_name=None,
         )
         self.post_feedforward_layernorm_weight_ = NoTpGEMMANormWeight(
-            self._post_feedforward_layernorm_name, self.data_type_, bias_name=None
+            dim=self.n_embed,
+            weight_name=self._post_feedforward_layernorm_name,
+            data_type=self.data_type_,
+            bias_name=None,
         )
 
     def load_hf_weights(self, weights):
diff --git a/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py b/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py
index 9102ce677..87b2fb744 100644
--- a/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py
@@ -29,5 +29,9 @@ def _init_qkv(self):
         )
 
     def _init_norm(self):
-        self.att_norm_weight_ = NoTpGEMMANormWeight(self._att_norm_weight_name, self.data_type_)
-        self.ffn_norm_weight_ = NoTpGEMMANormWeight(self._ffn_norm_weight_name, self.data_type_)
+        self.att_norm_weight_ = NoTpGEMMANormWeight(
+            dim=self.n_embed, weight_name=self._att_norm_weight_name, data_type=self.data_type_, bias_name=None
+        )
+        self.ffn_norm_weight_ = NoTpGEMMANormWeight(
+            dim=self.n_embed, weight_name=self._ffn_norm_weight_name, data_type=self.data_type_, bias_name=None
+        )
diff --git a/lightllm/models/llama/layer_weights/transformer_layer_weight.py b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
index 23ecbbabd..b68903ecd 100644
--- a/lightllm/models/llama/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
@@ -93,19 +93,17 @@ def _init_o(self):
         )
 
     def _init_ffn(self):
-        in_dim = self.n_embed
-        out_dim = self.n_inter // self.tp_world_size_
         self.gate_up_proj = ROWMMWeight(
-            in_dim=in_dim,
-            out_dims=[out_dim, out_dim],
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter, self.n_inter],
             weight_names=[self._gate_weight_name, self._up_weight_name],
             data_type=self.data_type_,
             bias_names=[self._gate_bias_name, self._up_bias_name],
             quant_method=self.get_quant_method("gate_up_proj"),
         )
         self.down_proj = COLMMWeight(
-            in_dim=out_dim,
-            out_dims=[in_dim],
+            in_dim=self.n_inter,
+            out_dims=[self.n_embed],
             weight_names=self._down_weight_name,
             data_type=self.data_type_,
             bias_names=self._down_bias_name,
diff --git a/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
index cbf420f50..014f4f6ac 100644
--- a/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
@@ -19,14 +19,13 @@ def _init_weight_names(self):
 
     def _init_norm(self):
         super()._init_norm()
-        hidden_size = self.network_config_["hidden_size"]
         self.q_norm_weight_ = RMSNormWeight(
-            dim=hidden_size,
+            dim=self.head_dim,
             weight_name=self._q_norm_name,
             data_type=self.data_type_,
         )
         self.k_norm_weight_ = RMSNormWeight(
-            dim=hidden_size,
+            dim=self.head_dim,
             weight_name=self._k_norm_name,
             data_type=self.data_type_,
         )

From 94bf9a09d5bacb43aacbb3af6264354249774c9e Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 11:57:48 +0000
Subject: [PATCH 08/65] fix: remove self.lock (threading.Lock) from mm_weight __init__

---
 .../basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py  | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 2bb7193c5..a7288b818 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -29,7 +29,6 @@ def __init__(
         tp_world_size: int = None,
     ) -> None:
         super().__init__(tp_rank, tp_world_size, data_type)
-        self.lock = threading.Lock()
 
         self.in_dim = in_dim
         if isinstance(out_dims, int):

From 7b2595a525792313aaeebcac5e20e7ee6da7d02f Mon Sep 17 00:00:00 2001
From: sufubao <sufubao@sensetime.com>
Date: Mon, 12 Jan 2026 10:45:36 +0000
Subject: [PATCH 09/65] MOE

---
 .../meta_weights/embedding_weight.py          |   9 +-
 .../fused_moe/fused_moe_weight_ep.py          | 138 ++++++++++++++---
 .../fused_moe/fused_moe_weight_tp.py          | 142 +++++++++++-------
 .../fused_moe/gpt_oss_fused_moe_weight_tp.py  |  68 ++++++++-
 .../layer_weights/meta_weights/norm_weight.py |   6 +-
 5 files changed, 274 insertions(+), 89 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index e228d5c86..e3dc0af19 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -12,7 +12,7 @@
 
 class EmbeddingWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch.dtype):
-        super().__init__()
+        BaseWeightTpl.__init__(self, data_type=data_type)
         self.dim = dim
         self.vocab_size = vocab_size
         self.tp_world_size_ = get_dp_world_size()
@@ -24,6 +24,7 @@ def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch
         self.weight_name: str = weight_name
         self.data_type_ = data_type
         self._create_weight()
+        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
@@ -86,7 +87,7 @@ def __init__(
         data_type: torch.dtype,
         shared_weight: Optional[EmbeddingWeight] = None,
     ):
-        super().__init__()
+        BaseWeightTpl.__init__(self, data_type=data_type)
         self.dim = dim
         self.vocab_size = vocab_size
         self.tp_world_size_ = get_dp_world_size()
@@ -100,6 +101,7 @@ def __init__(
         self._shared_weight = shared_weight
         if shared_weight is None:
             self._create_weight()
+        PlatformAwareOp.__init__(self)
 
     @property
     def weight(self) -> torch.Tensor:
@@ -154,7 +156,7 @@ def __call__(self, input: torch.Tensor, out: Optional[torch.Tensor] = None, allo
 
 class NoTpPosEmbeddingWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, max_position_embeddings: int, weight_name: str, data_type: torch.dtype):
-        super().__init__()
+        BaseWeightTpl.__init__(self, data_type=data_type)
         self.dim = dim
         self.max_position_embeddings = max_position_embeddings
         self.weight_name: str = weight_name
@@ -162,6 +164,7 @@ def __init__(self, dim: int, max_position_embeddings: int, weight_name: str, dat
         self.tp_world_size_ = 1
         self.tp_rank_ = 0
         self._create_weight()
+        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
index 0923d5dea..9a4feccdb 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
@@ -1,9 +1,9 @@
-import os
 import torch
 import threading
 from typing import Optional, Tuple, List, Dict, Any
-from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_device_id
-from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeight
+from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
+from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
+from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
 from lightllm.common.fused_moe.grouped_fused_moe_ep import (
     fused_experts_impl,
     masked_group_gemm,
@@ -29,7 +29,7 @@
 logger = init_logger(__name__)
 
 
-class FusedMoeWeightEP(BaseWeight):
+class FusedMoeWeightEP(BaseWeightTpl, PlatformAwareOp):
     def __init__(
         self,
         gate_proj_name: str,
@@ -44,7 +44,7 @@ def __init__(
         quant_cfg=None,
         hidden_size: Optional[int] = None,
     ) -> None:
-        super().__init__()
+        BaseWeightTpl.__init__(self, data_type=data_type)
 
         self.layer_num = layer_num
         self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
@@ -63,7 +63,6 @@ def __init__(
         self.w3_weight_name = up_proj_name
         self.e_score_correction_bias_name = e_score_correction_bias_name
         self.n_routed_experts = n_routed_experts
-        self.data_type_ = data_type
         self.hidden_size = hidden_size
 
         global_world_size = get_global_world_size()
@@ -113,6 +112,8 @@ def __init__(
         if self.hidden_size is not None:
             self._create_weight()
 
+        PlatformAwareOp.__init__(self)
+
     def _create_weight(self):
         """Pre-allocate GPU memory for fused MoE weights"""
         if self.hidden_size is None:
@@ -126,18 +127,22 @@ def _create_weight(self):
             # Default fallback - this will be corrected during load
             intermediate_size = self.hidden_size * 4
 
-        device_id = get_current_device_id()
-
         if not self.quantized_weight and self.quant_method is not None:
             # Quantized weights
             w1_pack = self.quant_method.create_weight(
-                total_expert_num * intermediate_size * 2, self.hidden_size, dtype=self.data_type_, device_id=device_id
+                total_expert_num * intermediate_size * 2,
+                self.hidden_size,
+                dtype=self.data_type_,
+                device_id=self.device_id_,
             )
             self.w1[0] = w1_pack.weight.view(total_expert_num, intermediate_size * 2, self.hidden_size)
             self.w1[1] = w1_pack.weight_scale.view(total_expert_num, intermediate_size * 2, self.hidden_size)
 
             w2_pack = self.quant_method.create_weight(
-                total_expert_num * self.hidden_size, intermediate_size, dtype=self.data_type_, device_id=device_id
+                total_expert_num * self.hidden_size,
+                intermediate_size,
+                dtype=self.data_type_,
+                device_id=self.device_id_,
             )
             self.w2[0] = w2_pack.weight.view(total_expert_num, self.hidden_size, intermediate_size)
             self.w2[1] = w2_pack.weight_scale.view(total_expert_num, self.hidden_size, intermediate_size)
@@ -146,25 +151,18 @@ def _create_weight(self):
             self.w1[0] = torch.empty(
                 (total_expert_num, intermediate_size * 2, self.hidden_size),
                 dtype=self.data_type_,
-                device=f"cuda:{device_id}",
+                device=f"cuda:{self.device_id_}",
             )
             self.w2[0] = torch.empty(
                 (total_expert_num, self.hidden_size, intermediate_size),
                 dtype=self.data_type_,
-                device=f"cuda:{device_id}",
+                device=f"cuda:{self.device_id_}",
             )
 
-    def experts(
-        self,
-        input_tensor,
-        router_logits,
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        is_prefill,
+    def _select_experts(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
     ):
+        """Select experts and return topk weights and ids."""
         topk_weights, topk_ids = select_experts(
             hidden_states=input_tensor,
             router_logits=router_logits,
@@ -187,6 +185,74 @@ def experts(
                 expert_counter=self.routed_expert_counter_tensor,
                 enable_counter=self.auto_update_redundancy_expert,
             )
+        return topk_weights, topk_ids
+
+    def _native_forward(
+        self,
+        input_tensor,
+        router_logits,
+        top_k,
+        renormalize,
+        use_grouped_topk,
+        topk_group,
+        num_expert_group,
+        is_prefill,
+    ):
+        """PyTorch native implementation for EP MoE forward pass."""
+        topk_weights, topk_ids = self._select_experts(
+            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+        )
+
+        w1, w1_scale = self.w1
+        w2, w2_scale = self.w2
+
+        # Native PyTorch implementation (less optimized but works on all platforms)
+        batch_size, hidden_size = input_tensor.shape
+        intermediate_size = w1.shape[1] // 2
+
+        output = torch.zeros_like(input_tensor)
+
+        for i in range(batch_size):
+            expert_output = torch.zeros(hidden_size, dtype=input_tensor.dtype, device=input_tensor.device)
+            for j in range(top_k):
+                expert_idx = topk_ids[i, j].item()
+                weight = topk_weights[i, j]
+
+                # Get local expert index (EP mode uses local expert indices)
+                local_expert_idx = expert_idx % self.ep_load_expert_num
+
+                # Get expert weights
+                w1_expert = w1[local_expert_idx, :intermediate_size, :]  # gate
+                w3_expert = w1[local_expert_idx, intermediate_size:, :]  # up
+                w2_expert = w2[local_expert_idx]
+
+                # Compute: SiLU(x @ w1.T) * (x @ w3.T) @ w2.T
+                x = input_tensor[i : i + 1]
+                gate = torch.nn.functional.silu(torch.mm(x, w1_expert.T))
+                up = torch.mm(x, w3_expert.T)
+                hidden = gate * up
+                expert_out = torch.mm(hidden, w2_expert.T)
+                expert_output += weight * expert_out.squeeze(0)
+
+            output[i] = expert_output
+
+        return output
+
+    def _cuda_forward(
+        self,
+        input_tensor,
+        router_logits,
+        top_k,
+        renormalize,
+        use_grouped_topk,
+        topk_group,
+        num_expert_group,
+        is_prefill,
+    ):
+        """CUDA optimized implementation for EP MoE forward pass."""
+        topk_weights, topk_ids = self._select_experts(
+            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+        )
 
         w1, w1_scale = self.w1
         w2, w2_scale = self.w2
@@ -207,6 +273,29 @@ def experts(
             previous_event=None,  # for overlap
         )
 
+    def experts(
+        self,
+        input_tensor,
+        router_logits,
+        top_k,
+        renormalize,
+        use_grouped_topk,
+        topk_group,
+        num_expert_group,
+        is_prefill,
+    ):
+        """Backward compatible method that routes to platform-specific implementation."""
+        return self._forward(
+            input_tensor=input_tensor,
+            router_logits=router_logits,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            is_prefill=is_prefill,
+        )
+
     def low_latency_dispatch(
         self,
         hidden_states: torch.Tensor,
@@ -651,10 +740,9 @@ def _copy_expert_scales(self, target_idx, expert_id, weights):
             self.w2[1][target_idx].copy_(w2_scale_tensor)
 
     def _cuda(self, cpu_tensor):
-        device_id = get_current_device_id()
         if self.quantized_weight:
-            return cpu_tensor.contiguous().cuda(device_id)
-        return cpu_tensor.contiguous().to(self.data_type_).cuda(device_id)
+            return cpu_tensor.contiguous().cuda(self.device_id_)
+        return cpu_tensor.contiguous().to(self.data_type_).cuda(self.device_id_)
 
     def verify_load(self):
         return self.w1 is not None and self.w2 is not None
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
index bf7b218b7..d30475444 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
@@ -1,9 +1,7 @@
-import os
 import torch
-import threading
-from typing import Tuple, List, Dict, Any, Union, Callable
-from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeight
-from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id, get_dp_world_size
+from typing import Dict, Any, Union
+from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
+from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
 from lightllm.common.quantization import Quantcfg
 from lightllm.common.quantization.quantize_method import WeightPack
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_slicer import (
@@ -59,7 +57,7 @@ def create_tp_moe_wegiht_obj(
         )
 
 
-class FusedMoeWeightTP(BaseWeight):
+class FusedMoeWeightTP(BaseWeightTpl, PlatformAwareOp):
     def __init__(
         self,
         gate_proj_name: str,
@@ -75,7 +73,7 @@ def __init__(
         layer_num: int,
         quant_cfg: Quantcfg = None,
     ) -> None:
-        super().__init__()
+        BaseWeightTpl.__init__(self, data_type=data_type)
         self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
         self.quantized_weight = quant_cfg.quantized_weight
         if self.quant_method.method_name != "none":
@@ -92,48 +90,49 @@ def __init__(
         self.num_fused_shared_experts = num_fused_shared_experts
         self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0)
         self.split_inter_size = split_inter_size
-        self.data_type_ = data_type
         self.hidden_size = network_config.get("hidden_size")
-        self.tp_rank_ = get_current_rank_in_dp()
         self.e_score_correction_bias = None
         self.scoring_func = network_config.get("scoring_func", "softmax")
         self.row_slicer = get_row_slice_mixin(
-            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=get_dp_world_size()
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
         )
         self.col_slicer = get_col_slice_mixin(
-            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=get_dp_world_size()
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
         )
         self._create_weight()
+        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         total_expert_num = self.n_routed_experts
         intermediate_size = self.split_inter_size
-        device_id = get_current_device_id()
 
         # Create e_score_correction_bias
         if self.e_score_correction_bias is not None:
             self.e_score_correction_bias = torch.empty(
                 (total_expert_num,),
                 dtype=self.data_type_,
-                device=f"cuda:{device_id}",
+                device=f"cuda:{self.device_id_}",
             )
 
         self.w13: WeightPack = self.quant_method.create_weight(
             out_dim=intermediate_size * 2,
             in_dim=self.hidden_size,
             dtype=self.data_type_,
-            device_id=device_id,
+            device_id=self.device_id_,
             num_experts=total_expert_num,
         )
         self.w2: WeightPack = self.quant_method.create_weight(
             out_dim=self.hidden_size,
             in_dim=intermediate_size,
             dtype=self.data_type_,
-            device_id=device_id,
+            device_id=self.device_id_,
             num_experts=total_expert_num,
         )
 
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
+    def _select_experts(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+    ):
+        """Select experts and return topk weights and ids."""
         from lightllm.common.fused_moe.topk_select import select_experts
 
         topk_weights, topk_ids = select_experts(
@@ -169,6 +168,53 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
 
             topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
             topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
+        return topk_weights, topk_ids
+
+    def _native_forward(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+    ):
+        topk_weights, topk_ids = self._select_experts(
+            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+        )
+
+        w13, _ = self.w13.weight, self.w13.weight_scale
+        w2, _ = self.w2.weight, self.w2.weight_scale
+
+        batch_size, hidden_size = input_tensor.shape
+        intermediate_size = w13.shape[1] // 2
+
+        output = torch.zeros_like(input_tensor)
+
+        for i in range(batch_size):
+            expert_output = torch.zeros(hidden_size, dtype=input_tensor.dtype, device=input_tensor.device)
+            for j in range(top_k):
+                expert_idx = topk_ids[i, j].item()
+                weight = topk_weights[i, j]
+
+                w1 = w13[expert_idx, :intermediate_size, :]  # gate
+                w3 = w13[expert_idx, intermediate_size:, :]  # up
+                w2_expert = w2[expert_idx]
+
+                # Compute: SiLU(x @ w1.T) * (x @ w3.T) @ w2.T
+                x = input_tensor[i : i + 1]
+                gate = torch.nn.functional.silu(torch.mm(x, w1.T))
+                up = torch.mm(x, w3.T)
+                hidden = gate * up
+                expert_out = torch.mm(hidden, w2_expert.T)
+                expert_output += weight * expert_out.squeeze(0)
+
+            output[i] = expert_output
+
+        input_tensor.copy_(output)
+        return
+
+    def _cuda_forward(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+    ):
+        """CUDA optimized implementation of MoE forward pass."""
+        topk_weights, topk_ids = self._select_experts(
+            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+        )
 
         w13, w13_scale = self.w13.weight, self.w13.weight_scale
         w2, w2_scale = self.w2.weight, self.w2.weight_scale
@@ -189,11 +235,22 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
         )
         return
 
+    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
+        """Backward compatible method that routes to platform-specific implementation."""
+        return self._forward(
+            input_tensor=input_tensor,
+            router_logits=router_logits,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+        )
+
     def _cuda(self, cpu_tensor):
-        device_id = get_current_device_id()
         if self.quantized_weight:
-            return cpu_tensor.cuda(device_id)
-        return cpu_tensor.cuda(device_id)
+            return cpu_tensor.cuda(self.device_id_)
+        return cpu_tensor.cuda(self.device_id_)
 
     def verify_load(self):
         return True
@@ -259,42 +316,19 @@ def __init__(self, *args, **kwargs):
 
         self.workspace = marlin_make_workspace_new(self.w13.weight.device, 4)
 
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
-        from lightllm.common.fused_moe.topk_select import select_experts
-
-        topk_weights, topk_ids = select_experts(
-            hidden_states=input_tensor,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            scoring_func=self.scoring_func,
+    def _native_forward(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+    ):
+        """AWQ Marlin quantization requires CUDA, native forward not supported."""
+        raise NotImplementedError("AWQ Marlin MoE requires CUDA platform, native forward not supported.")
+
+    def _cuda_forward(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+    ):
+        """CUDA optimized implementation using AWQ Marlin kernels."""
+        topk_weights, topk_ids = self._select_experts(
+            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
         )
-        topk_weights.mul_(self.routed_scaling_factor)
-        if self.num_fused_shared_experts > 0:
-            pad_topk_ids = (
-                torch.arange(
-                    start=self.n_routed_experts - self.num_fused_shared_experts,
-                    end=self.n_routed_experts,
-                    step=1,
-                    dtype=topk_ids.dtype,
-                    device="cuda",
-                )
-                .view(1, self.num_fused_shared_experts)
-                .repeat(topk_ids.shape[0], 1)
-            )
-            pad_topk_weights = torch.full(
-                (topk_weights.shape[0], self.num_fused_shared_experts),
-                fill_value=1.0,
-                device="cuda",
-                dtype=topk_weights.dtype,
-            )
-
-            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
-            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
 
         w1, w1_scale, w1_zero_point = self.w13.weight, self.w13.weight_scale, self.w13.weight_zero_point
         w2, w2_scale, w2_zero_point = self.w2.weight, self.w2.weight_scale, self.w2.weight_zero_point
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index 9d79ff7c2..9821b5ad6 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -1,10 +1,7 @@
-import os
 import torch
-import threading
-from typing import Optional, Tuple, List, Dict, Any
+from typing import Dict, Any
 
 from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight_tp import FusedMoeWeightTP
-from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id
 from lightllm.common.quantization import Quantcfg
 from lightllm.utils.log_utils import init_logger
 
@@ -121,7 +118,56 @@ def router(self, router_logits, top_k):
         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
         return router_top_value, router_indices
 
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
+    def _native_forward(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+    ):
+        """PyTorch native implementation for GPT-OSS MoE forward pass."""
+        topk_weights, topk_ids = self.router(router_logits, top_k)
+
+        w1, w1_scale = self.w1
+        w2, w2_scale = self.w2
+
+        batch_size, hidden_size = input_tensor.shape
+
+        output = torch.zeros_like(input_tensor)
+        input_bf16 = input_tensor.to(torch.bfloat16)
+
+        for i in range(batch_size):
+            expert_output = torch.zeros(hidden_size, dtype=torch.bfloat16, device=input_tensor.device)
+            for j in range(top_k):
+                expert_idx = topk_ids[i, j].item()
+                weight = topk_weights[i, j]
+
+                w1_expert = w1[expert_idx]
+                w2_expert = w2[expert_idx]
+
+                x = input_bf16[i : i + 1]
+                hidden = torch.mm(x, w1_expert.T)  # [1, intermediate_size * 2]
+                if self.w1_bias is not None:
+                    hidden = hidden + self.w1_bias[expert_idx : expert_idx + 1]
+
+                gate = hidden[:, 0::2]
+                up = hidden[:, 1::2]
+
+                gate = torch.clamp(gate * self.alpha, -self.limit, self.limit)
+                gate = torch.nn.functional.sigmoid(gate)
+                hidden = gate * up
+
+                expert_out = torch.mm(hidden, w2_expert.T)
+                if self.w2_bias is not None:
+                    expert_out = expert_out + self.w2_bias[expert_idx : expert_idx + 1] / self.tp_world_size_
+
+                expert_output += weight * expert_out.squeeze(0)
+
+            output[i] = expert_output
+
+        input_tensor.copy_(output.to(input_tensor.dtype))
+        return output
+
+    def _cuda_forward(
+        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
+    ):
+        """CUDA optimized implementation for GPT-OSS MoE forward pass."""
         topk_weights, topk_ids = self.router(router_logits, top_k)
 
         w1, w1_scale = self.w1
@@ -148,6 +194,18 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
         )
         return output_tensor
 
+    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
+        """Backward compatible method that routes to platform-specific implementation."""
+        return self._forward(
+            input_tensor=input_tensor,
+            router_logits=router_logits,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+        )
+
     def _convert_moe_packed_tensors(
         self,
         blocks,
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 023201610..b13259353 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -12,12 +12,13 @@
 
 class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        super().__init__()
+        BaseWeightTpl.__init__(self, data_type=data_type)
         self.dim = dim
         self.weight_name = weight_name
         self.data_type_ = data_type
         assert bias_name is None, "RMSNormWeight does not have bias"
         self._create_weight()
+        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
@@ -57,12 +58,13 @@ def __call__(
 
 class LayerNormWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        super().__init__()
+        BaseWeightTpl.__init__(self, data_type=data_type)
         self.dim = dim
         self.weight_name = weight_name
         self.bias_name = bias_name
         self.data_type_ = data_type
         self._create_weight()
+        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)

From 4df3637318eb423b0557abb4e8d9306225bb4635 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 12:09:08 +0000
Subject: [PATCH 10/65] norm_weight: stop passing data_type to BaseWeightTpl.__init__

---
 .../basemodel/layer_weights/meta_weights/norm_weight.py     | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index b13259353..023201610 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -12,13 +12,12 @@
 
 class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        BaseWeightTpl.__init__(self, data_type=data_type)
+        super().__init__()
         self.dim = dim
         self.weight_name = weight_name
         self.data_type_ = data_type
         assert bias_name is None, "RMSNormWeight does not have bias"
         self._create_weight()
-        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
@@ -58,13 +57,12 @@ def __call__(
 
 class LayerNormWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        BaseWeightTpl.__init__(self, data_type=data_type)
+        super().__init__()
         self.dim = dim
         self.weight_name = weight_name
         self.bias_name = bias_name
         self.data_type_ = data_type
         self._create_weight()
-        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)

From 101e89aa837f6878978c70a5509da91ecd94dffb Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 12:10:20 +0000
Subject: [PATCH 11/65] remove old meta_weights/fused_moe_weight_tp.py (superseded by fused_moe/fused_moe_weight_tp.py)

---
 .../meta_weights/fused_moe_weight_tp.py       | 669 ------------------
 1 file changed, 669 deletions(-)
 delete mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py
deleted file mode 100644
index 9295fa96a..000000000
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py
+++ /dev/null
@@ -1,669 +0,0 @@
-import os
-import torch
-import threading
-from typing import Optional, Tuple, List, Dict, Any, Union
-from .base_weight import BaseWeight
-from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id
-from lightllm.common.quantization import Quantcfg
-
-
-def create_tp_moe_wegiht_obj(
-    gate_proj_name: str,
-    down_proj_name: str,
-    up_proj_name: str,
-    e_score_correction_bias_name: str,
-    weight_prefix: str,
-    n_routed_experts: int,
-    num_fused_shared_experts: int,
-    split_inter_size: int,
-    data_type: torch.dtype,
-    network_config: Dict[str, Any],
-    layer_num: int,
-    quant_cfg: Quantcfg = None,
-) -> Union["FusedMoeWeightTP", "FusedAWQMARLINMoeWeightTP"]:
-    quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
-    if quant_method is not None and quant_method.method_name == "awq_marlin":
-        return FusedAWQMARLINMoeWeightTP(
-            gate_proj_name=gate_proj_name,
-            down_proj_name=down_proj_name,
-            up_proj_name=up_proj_name,
-            e_score_correction_bias_name=e_score_correction_bias_name,
-            weight_prefix=weight_prefix,
-            n_routed_experts=n_routed_experts,
-            num_fused_shared_experts=num_fused_shared_experts,
-            split_inter_size=split_inter_size,
-            data_type=data_type,
-            network_config=network_config,
-            layer_num=layer_num,
-            quant_cfg=quant_cfg,
-        )
-    else:
-        return FusedMoeWeightTP(
-            gate_proj_name=gate_proj_name,
-            down_proj_name=down_proj_name,
-            up_proj_name=up_proj_name,
-            e_score_correction_bias_name=e_score_correction_bias_name,
-            weight_prefix=weight_prefix,
-            n_routed_experts=n_routed_experts,
-            num_fused_shared_experts=num_fused_shared_experts,
-            split_inter_size=split_inter_size,
-            data_type=data_type,
-            network_config=network_config,
-            layer_num=layer_num,
-            quant_cfg=quant_cfg,
-        )
-
-
-class FusedMoeWeightTP(BaseWeight):
-    def __init__(
-        self,
-        gate_proj_name: str,
-        down_proj_name: str,
-        up_proj_name: str,
-        e_score_correction_bias_name: str,
-        weight_prefix: str,
-        n_routed_experts: int,
-        num_fused_shared_experts: int,
-        split_inter_size: int,
-        data_type: torch.dtype,
-        network_config: Dict[str, Any],
-        layer_num: int,
-        quant_cfg: Quantcfg = None,
-    ) -> None:
-        super().__init__()
-        self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
-        self.quantized_weight = quant_cfg.quantized_weight
-        if self.quant_method is not None:
-            self.weight_scale_suffix = self.quant_method.weight_scale_suffix
-            self.quant_method.is_moe = True
-        self.w1_weight_name = gate_proj_name
-        self.w2_weight_name = down_proj_name
-        self.w3_weight_name = up_proj_name
-
-        self.e_score_correction_bias_name = e_score_correction_bias_name
-        self.weight_prefix = weight_prefix
-        assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
-        self.n_routed_experts = n_routed_experts + num_fused_shared_experts
-        self.num_fused_shared_experts = num_fused_shared_experts
-        self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0)
-        self.split_inter_size = split_inter_size
-        self.data_type_ = data_type
-        self.tp_rank_ = get_current_rank_in_dp()
-        self.experts_up_projs = [None] * self.n_routed_experts
-        self.experts_gate_projs = [None] * self.n_routed_experts
-        self.experts_up_proj_scales = [None] * self.n_routed_experts
-        self.experts_gate_proj_scales = [None] * self.n_routed_experts
-        self.e_score_correction_bias = None
-        self.w2_list = [None] * self.n_routed_experts
-        self.w2_scale_list = [None] * self.n_routed_experts
-        self.scoring_func = network_config.get("scoring_func", "softmax")
-        self.w1 = [None, None]  # weight, weight_scale
-        self.w2 = [None, None]  # weight, weight_scale
-        self.lock = threading.Lock()
-
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
-        from lightllm.common.fused_moe.topk_select import select_experts
-
-        topk_weights, topk_ids = select_experts(
-            hidden_states=input_tensor,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            scoring_func=self.scoring_func,
-        )
-        topk_weights.mul_(self.routed_scaling_factor)
-        if self.num_fused_shared_experts > 0:
-            pad_topk_ids = (
-                torch.arange(
-                    start=self.n_routed_experts - self.num_fused_shared_experts,
-                    end=self.n_routed_experts,
-                    step=1,
-                    dtype=topk_ids.dtype,
-                    device="cuda",
-                )
-                .view(1, self.num_fused_shared_experts)
-                .repeat(topk_ids.shape[0], 1)
-            )
-            pad_topk_weights = torch.full(
-                (topk_weights.shape[0], self.num_fused_shared_experts),
-                fill_value=1.0,
-                device="cuda",
-                dtype=topk_weights.dtype,
-            )
-
-            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
-            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
-
-        w1, w1_scale = self.w1
-        w2, w2_scale = self.w2
-        use_fp8_w8a8 = self.quant_method is not None
-
-        from lightllm.common.fused_moe.grouped_fused_moe import fused_experts
-
-        fused_experts(
-            hidden_states=input_tensor,
-            w1=w1,
-            w2=w2,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            use_fp8_w8a8=use_fp8_w8a8,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-        )
-        return
-
-    def _fuse(self):
-        if self.quantized_weight:
-            self._fuse_weight_scale()
-        with self.lock:
-            if (
-                hasattr(self, "experts_up_projs")
-                and None not in self.experts_up_projs
-                and None not in self.experts_gate_projs
-                and None not in self.w2_list
-            ):
-                gate_out_dim, gate_in_dim = self.experts_gate_projs[0].shape
-                up_out_dim, up_in_dim = self.experts_up_projs[0].shape
-                assert gate_in_dim == up_in_dim
-                dtype = self.experts_gate_projs[0].dtype
-                total_expert_num = self.n_routed_experts
-
-                w1 = torch.empty((total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu")
-
-                for i_experts in range(self.n_routed_experts):
-                    w1[i_experts, 0:gate_out_dim:, :] = self.experts_gate_projs[i_experts]
-                    w1[i_experts, gate_out_dim:, :] = self.experts_up_projs[i_experts]
-
-                inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
-                w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
-                if not self.quantized_weight and self.quant_method is not None:
-                    qw1, qw1_scale, qw1_zero_point = self.quant_method.quantize(w1)
-                    qw2, qw2_scale, qw2_zero_point = self.quant_method.quantize(w2)
-                    self.w1[0] = qw1
-                    self.w1[1] = qw1_scale
-                    self.w2[0] = qw2
-                    self.w2[1] = qw2_scale
-                else:
-                    self.w1[0] = self._cuda(w1)
-                    self.w2[0] = self._cuda(w2)
-                delattr(self, "w2_list")
-                delattr(self, "experts_up_projs")
-                delattr(self, "experts_gate_projs")
-
-    def _fuse_weight_scale(self):
-        with self.lock:
-            if (
-                hasattr(self, "experts_up_proj_scales")
-                and None not in self.experts_up_proj_scales
-                and None not in self.experts_gate_proj_scales
-                and None not in self.w2_scale_list
-            ):
-                gate_out_dim, gate_in_dim = self.experts_gate_proj_scales[0].shape
-                up_out_dim, up_in_dim = self.experts_up_proj_scales[0].shape
-                assert gate_in_dim == up_in_dim
-                dtype = self.experts_gate_proj_scales[0].dtype
-                total_expert_num = self.n_routed_experts
-
-                w1_scale = torch.empty(
-                    (total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu"
-                )
-
-                for i_experts in range(self.n_routed_experts):
-                    w1_scale[i_experts, 0:gate_out_dim:, :] = self.experts_gate_proj_scales[i_experts]
-                    w1_scale[i_experts, gate_out_dim:, :] = self.experts_up_proj_scales[i_experts]
-                inter_shape, hidden_size = self.w2_scale_list[0].shape[0], self.w2_scale_list[0].shape[1]
-                w2_scale = torch._utils._flatten_dense_tensors(self.w2_scale_list).view(
-                    len(self.w2_scale_list), inter_shape, hidden_size
-                )
-                self.w1[1] = self._cuda(w1_scale)
-                self.w2[1] = self._cuda(w2_scale)
-                delattr(self, "w2_scale_list")
-                delattr(self, "experts_up_proj_scales")
-                delattr(self, "experts_gate_proj_scales")
-
-    def load_hf_weights(self, weights):
-        if self.e_score_correction_bias_name in weights:
-            self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])
-        for i_experts in range(self.n_routed_experts):
-            w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.weight"
-            w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.weight"
-            w3_weight = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.weight"
-
-            if w1_weight in weights:
-                self.experts_gate_projs[i_experts] = weights[w1_weight][
-                    self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1), :
-                ]
-            if w3_weight in weights:
-                self.experts_up_projs[i_experts] = weights[w3_weight][
-                    self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1), :
-                ]
-
-            if w2_weight in weights:
-                self.w2_list[i_experts] = weights[w2_weight][
-                    :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
-                ]
-        if self.quant_method is not None:
-            self._load_weight_scale(weights)
-        self._fuse()
-
-    def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
-        block_size = 1
-        if hasattr(self.quant_method, "block_size"):
-            block_size = self.quant_method.block_size
-        for i_experts in range(self.n_routed_experts):
-            w1_scale = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.{self.weight_scale_suffix}"
-            w2_scale = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.{self.weight_scale_suffix}"
-            w3_scale = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.{self.weight_scale_suffix}"
-            if w1_scale in weights:
-                self.experts_gate_proj_scales[i_experts] = weights[w1_scale][
-                    self.split_inter_size
-                    // block_size
-                    * self.tp_rank_ : self.split_inter_size
-                    // block_size
-                    * (self.tp_rank_ + 1),
-                    :,
-                ]
-            if w3_scale in weights:
-                self.experts_up_proj_scales[i_experts] = weights[w3_scale][
-                    self.split_inter_size
-                    // block_size
-                    * self.tp_rank_ : self.split_inter_size
-                    // block_size
-                    * (self.tp_rank_ + 1),
-                    :,
-                ]
-
-            if w2_scale in weights:
-                self.w2_scale_list[i_experts] = weights[w2_scale][
-                    :,
-                    self.split_inter_size
-                    // block_size
-                    * self.tp_rank_ : self.split_inter_size
-                    // block_size
-                    * (self.tp_rank_ + 1),
-                ]
-
-    def _cuda(self, cpu_tensor):
-        device_id = get_current_device_id()
-        if self.quantized_weight:
-            return cpu_tensor.contiguous().cuda(device_id)
-        return cpu_tensor.contiguous().to(self.data_type_).cuda(device_id)
-
-    def verify_load(self):
-        return self.w1 is not None and self.w2 is not None
-
-
-class FusedAWQMARLINMoeWeightTP(BaseWeight):
-    def __init__(
-        self,
-        gate_proj_name: str,
-        down_proj_name: str,
-        up_proj_name: str,
-        e_score_correction_bias_name: str,
-        weight_prefix: str,
-        n_routed_experts: int,
-        num_fused_shared_experts: int,
-        split_inter_size: int,
-        data_type: torch.dtype,
-        network_config: Dict[str, Any],
-        layer_num: int,
-        quant_cfg: Quantcfg = None,
-    ) -> None:
-        super().__init__()
-        self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
-        self.quantized_weight = quant_cfg.quantized_weight
-        if self.quant_method is not None:
-            self.weight_scale_suffix = self.quant_method.weight_scale_suffix
-            self.weight_zero_point_suffix = self.quant_method.weight_zero_point_suffix
-            self.quant_method.is_moe = True
-        hf_quantization_config = network_config.get("quantization_config", None)
-        self.num_bits = hf_quantization_config.get("bits", 4)
-        self.group_size = hf_quantization_config.get("group_size", 128)
-        self.pack_factor = 32 // self.num_bits
-        self.has_processed_weight = False
-        assert self.quant_method.method_name == "awq_marlin"
-
-        self.w1_weight_name = gate_proj_name
-        self.w2_weight_name = down_proj_name
-        self.w3_weight_name = up_proj_name
-
-        self.e_score_correction_bias_name = e_score_correction_bias_name
-        self.weight_prefix = weight_prefix
-        assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
-        self.n_routed_experts = n_routed_experts + num_fused_shared_experts
-        self.num_fused_shared_experts = num_fused_shared_experts
-        self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0)
-        self.split_inter_size = split_inter_size
-        self.data_type_ = data_type
-        self.tp_rank_ = get_current_rank_in_dp()
-        self.experts_up_projs = [None] * self.n_routed_experts
-        self.experts_gate_projs = [None] * self.n_routed_experts
-        self.experts_up_proj_scales = [None] * self.n_routed_experts
-        self.experts_up_proj_zero_points = [None] * self.n_routed_experts
-        self.experts_gate_proj_scales = [None] * self.n_routed_experts
-        self.experts_gate_proj_zero_points = [None] * self.n_routed_experts
-        self.e_score_correction_bias = None
-        self.w2_list = [None] * self.n_routed_experts
-        self.w2_scale_list = [None] * self.n_routed_experts
-        self.w2_zero_point_list = [None] * self.n_routed_experts
-        self.scoring_func = network_config.get("scoring_func", "softmax")
-        self.w1 = [None, None, None]  # weight, weight_scale, zero_point
-        self.w2 = [None, None, None]  # weight, weight_scale, zero_point
-        self.lock = threading.Lock()
-
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
-        from lightllm.common.fused_moe.topk_select import select_experts
-
-        topk_weights, topk_ids = select_experts(
-            hidden_states=input_tensor,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            scoring_func=self.scoring_func,
-        )
-        topk_weights.mul_(self.routed_scaling_factor)
-        if self.num_fused_shared_experts > 0:
-            pad_topk_ids = (
-                torch.arange(
-                    start=self.n_routed_experts - self.num_fused_shared_experts,
-                    end=self.n_routed_experts,
-                    step=1,
-                    dtype=topk_ids.dtype,
-                    device="cuda",
-                )
-                .view(1, self.num_fused_shared_experts)
-                .repeat(topk_ids.shape[0], 1)
-            )
-            pad_topk_weights = torch.full(
-                (topk_weights.shape[0], self.num_fused_shared_experts),
-                fill_value=1.0,
-                device="cuda",
-                dtype=topk_weights.dtype,
-            )
-
-            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
-            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
-
-        w1, w1_scale, w1_zero_point = self.w1
-        w2, w2_scale, w2_zero_point = self.w2
-
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
-
-        fused_marlin_moe(
-            input_tensor,
-            w1,
-            w2,
-            None,
-            None,
-            w1_scale,
-            w2_scale,
-            router_logits,
-            topk_weights,
-            topk_ids,
-            quant_type_id=self.quant_method.vllm_quant_type.id,
-            apply_router_weight_on_input=False,
-            global_num_experts=-1,
-            expert_map=None,
-            w1_zeros=w1_zero_point,
-            w2_zeros=w2_zero_point,
-            workspace=self.workspace,
-            inplace=True,
-        )
-
-        return
-
-    def _fuse(self):
-        self._fuse_weight()
-        self._fuse_weight_scale()
-        self._fuse_weight_zero_point()
-
-    def _fuse_weight(self):
-        with self.lock:
-            if (
-                hasattr(self, "experts_up_projs")
-                and None not in self.experts_up_projs
-                and None not in self.experts_gate_projs
-                and None not in self.w2_list
-            ):
-                gate_in_dim, gate_out_dim = self.experts_gate_projs[0].shape
-                up_in_dim, up_out_dim = self.experts_up_projs[0].shape
-                assert gate_in_dim == up_in_dim
-                total_expert_num = self.n_routed_experts
-
-                w1 = torch.empty(
-                    (total_expert_num, gate_in_dim, gate_out_dim + up_out_dim), dtype=torch.int32, device="cpu"
-                )
-
-                for i_experts in range(self.n_routed_experts):
-                    w1[i_experts, :, 0:gate_out_dim] = self.experts_gate_projs[i_experts]
-                    w1[i_experts, :, gate_out_dim:] = self.experts_up_projs[i_experts]
-
-                inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
-                w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
-                self.w1[0] = self._cuda(w1)
-                self.w2[0] = self._cuda(w2)
-                delattr(self, "w2_list")
-                delattr(self, "experts_up_projs")
-                delattr(self, "experts_gate_projs")
-
-    def _fuse_weight_scale(self):
-        with self.lock:
-            if (
-                hasattr(self, "experts_up_proj_scales")
-                and None not in self.experts_up_proj_scales
-                and None not in self.experts_gate_proj_scales
-                and None not in self.w2_scale_list
-            ):
-                gate_in_dim, gate_out_dim = self.experts_gate_proj_scales[0].shape
-                up_in_dim, up_out_dim = self.experts_up_proj_scales[0].shape
-                dtype = self.experts_gate_proj_scales[0].dtype
-                assert gate_in_dim == up_in_dim
-                total_expert_num = self.n_routed_experts
-                w1_scale = torch.empty(
-                    (total_expert_num, gate_in_dim, gate_out_dim + up_out_dim), dtype=dtype, device="cpu"
-                )
-                for i_experts in range(self.n_routed_experts):
-                    w1_scale[i_experts, :, 0:gate_out_dim] = self.experts_gate_proj_scales[i_experts]
-                    w1_scale[i_experts, :, gate_out_dim:] = self.experts_up_proj_scales[i_experts]
-                inter_shape, hidden_size = self.w2_scale_list[0].shape[0], self.w2_scale_list[0].shape[1]
-                w2_scale = torch._utils._flatten_dense_tensors(self.w2_scale_list).view(
-                    len(self.w2_scale_list), inter_shape, hidden_size
-                )
-                self.w1[1] = self._cuda(w1_scale).to(self.data_type_)
-                self.w2[1] = self._cuda(w2_scale).to(self.data_type_)
-                delattr(self, "w2_scale_list")
-                delattr(self, "experts_up_proj_scales")
-                delattr(self, "experts_gate_proj_scales")
-
-    def _fuse_weight_zero_point(self):
-        with self.lock:
-            if (
-                hasattr(self, "experts_up_proj_zero_points")
-                and None not in self.experts_up_proj_zero_points
-                and None not in self.experts_gate_proj_zero_points
-                and None not in self.w2_zero_point_list
-            ):
-                gate_in_dim, gate_out_dim = self.experts_gate_proj_zero_points[0].shape
-                up_in_dim, up_out_dim = self.experts_up_proj_zero_points[0].shape
-                assert gate_in_dim == up_in_dim
-                total_expert_num = self.n_routed_experts
-                w1_zero_point = torch.empty(
-                    (total_expert_num, gate_in_dim, gate_out_dim + up_out_dim), dtype=torch.int32, device="cpu"
-                )
-                for i_experts in range(self.n_routed_experts):
-                    w1_zero_point[i_experts, :, 0:gate_out_dim] = self.experts_gate_proj_zero_points[i_experts]
-                    w1_zero_point[i_experts, :, gate_out_dim:] = self.experts_up_proj_zero_points[i_experts]
-                inter_shape, hidden_size = self.w2_zero_point_list[0].shape[0], self.w2_zero_point_list[0].shape[1]
-                w2_zero_point = torch._utils._flatten_dense_tensors(self.w2_zero_point_list).view(
-                    len(self.w2_zero_point_list), inter_shape, hidden_size
-                )
-                self.w1[2] = self._cuda(w1_zero_point)
-                self.w2[2] = self._cuda(w2_zero_point)
-                delattr(self, "w2_zero_point_list")
-                delattr(self, "experts_up_proj_zero_points")
-                delattr(self, "experts_gate_proj_zero_points")
-
-    def load_hf_weights(self, weights):
-        self._load_weight(weights)
-        self._load_weight_scale(weights)
-        self._load_weight_zero_point(weights)
-        self._fuse()
-        self._process_weight_after_loading()
-
-    def _load_weight(self, weights: Dict[str, torch.Tensor]) -> None:
-        # awq quantization weight shape: in x out
-        if self.e_score_correction_bias_name in weights:
-            self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])
-        for i_experts in range(self.n_routed_experts):
-            w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.qweight"
-            w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.qweight"
-            w3_weight = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.qweight"
-
-            if w1_weight in weights:
-                self.experts_gate_projs[i_experts] = weights[w1_weight][
-                    :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
-                ]
-            if w3_weight in weights:
-                self.experts_up_projs[i_experts] = weights[w3_weight][
-                    :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
-                ]
-
-            if w2_weight in weights:
-                self.w2_list[i_experts] = weights[w2_weight][
-                    self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1), :
-                ]
-
-    def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
-        for i_experts in range(self.n_routed_experts):
-            w1_scale = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.{self.weight_scale_suffix}"
-            w2_scale = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.{self.weight_scale_suffix}"
-            w3_scale = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.{self.weight_scale_suffix}"
-            split_inter_size = self.split_inter_size * self.pack_factor
-            if w1_scale in weights:
-                self.experts_gate_proj_scales[i_experts] = weights[w1_scale][
-                    :,
-                    split_inter_size * self.tp_rank_ : split_inter_size * (self.tp_rank_ + 1),
-                ]
-            if w3_scale in weights:
-                self.experts_up_proj_scales[i_experts] = weights[w3_scale][
-                    :,
-                    split_inter_size * self.tp_rank_ : split_inter_size * (self.tp_rank_ + 1),
-                ]
-
-            if w2_scale in weights:
-                self.w2_scale_list[i_experts] = weights[w2_scale][
-                    split_inter_size * self.tp_rank_ : split_inter_size * (self.tp_rank_ + 1),
-                    :,
-                ]
-
-    def _load_weight_zero_point(self, weights: Dict[str, torch.Tensor]) -> None:
-        for i_experts in range(self.n_routed_experts):
-            w1_zero_point = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.{self.weight_zero_point_suffix}"
-            w2_zero_point = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.{self.weight_zero_point_suffix}"
-            w3_zero_point = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.{self.weight_zero_point_suffix}"
-            if w1_zero_point in weights:
-                self.experts_gate_proj_zero_points[i_experts] = weights[w1_zero_point][
-                    :,
-                    self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1),
-                ]
-            if w3_zero_point in weights:
-                self.experts_up_proj_zero_points[i_experts] = weights[w3_zero_point][
-                    :,
-                    self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1),
-                ]
-            if w2_zero_point in weights:
-                self.w2_zero_point_list[i_experts] = weights[w2_zero_point][
-                    self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1),
-                    :,
-                ]
-
-    def _process_weight_after_loading(self):
-        with self.lock:
-            if None in self.w1 or None in self.w2 or self.has_processed_weight:
-                return
-        self.has_processed_weight = True
-        from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops
-
-        assert HAS_VLLM, "moe awq marlin quantization requires kernels of vllm"
-
-        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-            marlin_moe_permute_scales,
-            moe_awq_to_marlin_zero_points,
-            marlin_make_workspace_new,
-        )
-
-        num_experts = self.n_routed_experts
-        device = self.w1[0].device
-
-        self.w13_g_idx_sort_indices = torch.nn.Parameter(
-            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
-            requires_grad=False,
-        )
-        self.w2_g_idx_sort_indices = torch.nn.Parameter(
-            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
-            requires_grad=False,
-        )
-        self.w1[0] = vllm_ops.awq_marlin_moe_repack(
-            self.w1[0],
-            self.w13_g_idx_sort_indices,
-            size_k=self.w1[0].shape[1],
-            size_n=self.w1[0].shape[2] * self.pack_factor,
-            num_bits=self.num_bits,
-        )
-
-        self.w2[0] = vllm_ops.awq_marlin_moe_repack(
-            self.w2[0],
-            self.w2_g_idx_sort_indices,
-            size_k=self.w2[0].shape[1],
-            size_n=self.w2[0].shape[2] * self.pack_factor,
-            num_bits=self.num_bits,
-        )
-
-        # Why does this take the intermediate size for size_k?
-        self.w1[1] = marlin_moe_permute_scales(
-            s=self.w1[1],
-            size_k=self.split_inter_size * self.pack_factor,
-            size_n=self.w1[1].shape[2],
-            group_size=self.group_size,
-        )
-
-        self.w2[1] = marlin_moe_permute_scales(
-            s=self.w2[1],
-            size_k=self.split_inter_size * self.pack_factor,
-            size_n=self.w2[1].shape[2],
-            group_size=self.group_size,
-        )
-
-        self.w1[2] = moe_awq_to_marlin_zero_points(
-            self.w1[2],
-            size_k=self.w1[2].shape[1],
-            size_n=self.w1[2].shape[2] * self.pack_factor,
-            num_bits=self.num_bits,
-        )
-
-        self.w2[2] = moe_awq_to_marlin_zero_points(
-            self.w2[2],
-            size_k=self.w2[2].shape[1],
-            size_n=self.w2[2].shape[2] * self.pack_factor,
-            num_bits=self.num_bits,
-        )
-
-        self.workspace = marlin_make_workspace_new(device, 4)
-
-    def _cuda(self, cpu_tensor):
-        device_id = get_current_device_id()
-        if self.quantized_weight:
-            return cpu_tensor.cuda(device_id)
-        return cpu_tensor.cuda(device_id)
-
-    def verify_load(self):
-        return self.w1 is not None and self.w2 is not None

From b90666aa5d0ffe015e8d6aed191d05387cee8e69 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 13:11:47 +0000
Subject: [PATCH 12/65] qk norm

---
 .../layer_weights/meta_weights/__init__.py    |  2 +-
 .../fused_moe/fused_moe_weight_tp.py          |  8 ----
 .../layer_weights/meta_weights/norm_weight.py | 45 +++++++++++++++++--
 .../basemodel}/triton_kernel/qk_norm.py       |  0
 .../layer_infer/transformer_layer_infer.py    |  9 +---
 .../layer_weights/transformer_layer_weight.py |  6 +--
 .../layer_infer/transformer_layer_infer.py    | 15 +++----
 .../layer_infer/transformer_layer_infer.py    |  7 +--
 .../layer_infer/transformer_layer_infer.py    |  7 +--
 9 files changed, 58 insertions(+), 41 deletions(-)
 rename lightllm/{models/qwen3 => common/basemodel}/triton_kernel/qk_norm.py (100%)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index 47bf7c05f..fef70acf5 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -5,7 +5,7 @@
     KVROWNMMWeight,
     COLMMWeight,
 )
-from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight
+from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight, QKRMSNORMWeight
 from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
 from .att_sink_weight import TpAttSinkWeight
 from .fused_moe.fused_moe_weight_tp import create_tp_moe_wegiht_obj
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
index d30475444..51822f790 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
@@ -247,14 +247,6 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
             num_expert_group=num_expert_group,
         )
 
-    def _cuda(self, cpu_tensor):
-        if self.quantized_weight:
-            return cpu_tensor.cuda(self.device_id_)
-        return cpu_tensor.cuda(self.device_id_)
-
-    def verify_load(self):
-        return True
-
     def load_hf_weights(self, weights):
         # Load bias
         if self.e_score_correction_bias_name in weights:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 023201610..73b937b77 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -4,11 +4,9 @@
 from lightllm.utils.dist_utils import get_current_device_id, get_current_rank_in_dp, get_dp_world_size
 from lightllm.common.basemodel.triton_kernel.rmsnorm import rmsnorm_forward
 from lightllm.common.basemodel.triton_kernel.layernorm import layernorm_forward
-from lightllm.utils.log_utils import init_logger
+from lightllm.common.basemodel.triton_kernel.qk_norm import qk_rmsnorm_forward
 from .platform_op import PlatformAwareOp
 
-logger = init_logger(__name__)
-
 
 class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
@@ -152,3 +150,44 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
         self.weight += 1
+
+
+class QKRMSNORMWeight(RMSNormWeight):
+    """RMSNorm applied independently to each attention head of a q/k projection.
+
+    Callers pass head_dim as `dim`; tp_world_size_/tp_rank_ are pinned to 1/0 in __init__.
+    """
+
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
+        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type, bias_name=bias_name)
+        self.tp_world_size_ = 1
+        self.tp_rank_ = 0
+
+    def _native_forward(self, input: torch.Tensor, eps: float) -> None:
+        """Pure-torch path: normalize each head_dim-sized group of the last dim, in place.
+
+        Args:
+            input: 2-D tensor whose last dim is a multiple of the weight length (head_dim).
+            eps: added to the per-head mean square before the reciprocal square root.
+        """
+        assert input.ndim == 2 and self.weight.ndim == 1
+        head_dim = self.weight.shape[0]
+        # Rows pack several heads, so the width must be a multiple of head_dim, not equal to it.
+        assert (
+            input.shape[-1] % head_dim == 0
+        ), f"Expected last dim to be a multiple of head_dim={head_dim}, but found: {input.shape[-1]}"
+        # reshape rather than view: input may be a non-contiguous column slice of a fused kv buffer.
+        x = input.to(torch.float32).reshape(-1, head_dim)
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + eps)
+        x = (x * self.weight).to(self.data_type_)
+        input.copy_(x.view(input.shape))
+
+    def _cuda_forward(self, input: torch.Tensor, eps: float) -> None:
+        """Kernel path: hand `input`, the weight and `eps` to qk_rmsnorm_forward; nothing is returned."""
+        assert input.ndim == 2 and self.weight.ndim == 1
+        qk_rmsnorm_forward(x=input, weight=self.weight, eps=eps)
+
+    def __call__(self, input: torch.Tensor, eps: float) -> None:
+        """Normalize `input` by delegating to `self._forward`; callers use `input` afterwards."""
+        return self._forward(input=input, eps=eps)
diff --git a/lightllm/models/qwen3/triton_kernel/qk_norm.py b/lightllm/common/basemodel/triton_kernel/qk_norm.py
similarity index 100%
rename from lightllm/models/qwen3/triton_kernel/qk_norm.py
rename to lightllm/common/basemodel/triton_kernel/qk_norm.py
diff --git a/lightllm/models/qwen3/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3/layer_infer/transformer_layer_infer.py
index 5f0c91287..82331f8fb 100644
--- a/lightllm/models/qwen3/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3/layer_infer/transformer_layer_infer.py
@@ -4,9 +4,6 @@
 from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
 from lightllm.models.llama.triton_kernel.rotary_emb import rotary_emb_fwd
-from lightllm.models.llama.triton_kernel.silu_and_mul import silu_and_mul_fwd
-from lightllm.models.qwen3.triton_kernel.qk_norm import qk_rmsnorm_forward
-from functools import partial
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -27,14 +24,12 @@ def _get_qkv(
         input = input.view(-1, self.embed_dim_)
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input)
-        qk_rmsnorm_forward(
+        layer_weight.q_norm_weight_(
             q,
-            weight=layer_weight.q_norm_weight_.weight,
             eps=self.eps_,
         )
-        qk_rmsnorm_forward(
+        layer_weight.k_norm_weight_(
             cache_kv[:, : self.tp_k_head_num_ * self.head_dim_],
-            weight=layer_weight.k_norm_weight_.weight,
             eps=self.eps_,
         )
         cache_kv = cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
diff --git a/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
index 014f4f6ac..7d2163f28 100644
--- a/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3/layer_weights/transformer_layer_weight.py
@@ -1,6 +1,6 @@
 from lightllm.models.qwen2.layer_weights.transformer_layer_weight import Qwen2TransformerLayerWeight
 from lightllm.common.basemodel.layer_weights.meta_weights import (
-    RMSNormWeight,
+    QKRMSNORMWeight,
 )
 
 
@@ -19,12 +19,12 @@ def _init_weight_names(self):
 
     def _init_norm(self):
         super()._init_norm()
-        self.q_norm_weight_ = RMSNormWeight(
+        self.q_norm_weight_ = QKRMSNORMWeight(
             dim=self.head_dim,
             weight_name=self._q_norm_name,
             data_type=self.data_type_,
         )
-        self.k_norm_weight_ = RMSNormWeight(
+        self.k_norm_weight_ = QKRMSNORMWeight(
             dim=self.head_dim,
             weight_name=self._k_norm_name,
             data_type=self.data_type_,
diff --git a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
index 5cd29dcdb..d273d51ad 100644
--- a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
@@ -60,16 +60,13 @@ def _get_qkv(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         input = input.view(-1, self.embed_dim_)
         q = layer_weight.q_proj.mm(input)
-        cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
-
-        layer_weight.q_norm_weight_(q.view(-1, self.head_dim_), eps=self.eps_, out=q.view(-1, self.head_dim_))
-
-        cache_kv[:, : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_(
-            input=cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
+        cache_kv = layer_weight.kv_proj.mm(input)
+        layer_weight.q_norm_weight_(q, eps=self.eps_)
+        layer_weight.k_norm_weight_(
+            cache_kv[:, : self.tp_k_head_num_ * self.head_dim_],
             eps=self.eps_,
-            alloc_func=self.alloc_tensor,
-        ).view(-1, self.tp_k_head_num_, cache_kv.shape[-1])
-
+        )
+        cache_kv = cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
         rotary_emb_fwd(
             q.view(-1, self.tp_q_head_num_, self.head_dim_),
             cache_kv[:, : self.tp_k_head_num_, :],
diff --git a/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py
index d1c51365a..d34babaab 100644
--- a/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py
@@ -9,7 +9,6 @@
 from lightllm.distributed import all_reduce
 from lightllm.models.qwen3_vl.triton_kernel.deepstack_multimodal_emb import apply_deepstack_features
 from lightllm.models.qwen2_vl.layer_infer.transformer_layer_infer import Qwen2VLTransformerLayerInfer
-from lightllm.models.qwen3.triton_kernel.qk_norm import qk_rmsnorm_forward
 from lightllm.utils.tensor_utils import tensor_to_no_ref_tensor
 
 
@@ -30,14 +29,12 @@ def _get_qkv(
         input = input.view(-1, self.embed_dim_)
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input)
-        qk_rmsnorm_forward(
+        layer_weight.q_norm_weight_(
             q,
-            weight=layer_weight.q_norm_weight_.weight,
             eps=self.eps_,
         )
-        qk_rmsnorm_forward(
+        layer_weight.k_norm_weight_(
             cache_kv[:, : self.tp_k_head_num_ * self.head_dim_],
-            weight=layer_weight.k_norm_weight_.weight,
             eps=self.eps_,
         )
         cache_kv = cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
diff --git a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
index 328cc0a62..4ccc6da37 100644
--- a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
@@ -5,7 +5,6 @@
 from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer
 from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
 from lightllm.models.qwen3_vl.infer_struct import Qwen3VLInferStateInfo
-from lightllm.models.qwen3.triton_kernel.qk_norm import qk_rmsnorm_forward
 from lightllm.distributed import all_reduce
 from lightllm.models.qwen3_vl.triton_kernel.deepstack_multimodal_emb import apply_deepstack_features
 
@@ -26,14 +25,12 @@ def _get_qkv(
         input = input.view(-1, self.embed_dim_)
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input)
-        qk_rmsnorm_forward(
+        layer_weight.q_norm_weight_(
             q,
-            weight=layer_weight.q_norm_weight_.weight,
             eps=self.eps_,
         )
-        qk_rmsnorm_forward(
+        layer_weight.k_norm_weight_(
             cache_kv[:, : self.tp_k_head_num_ * self.head_dim_],
-            weight=layer_weight.k_norm_weight_.weight,
             eps=self.eps_,
         )
         cache_kv = cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)

From d5d91924c9edb2261bb11c5fbdf4e4d1e7c48da4 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 12 Jan 2026 13:13:53 +0000
Subject: [PATCH 13/65] remove PlatformAwareOp.__init__()

---
 .../layer_weights/meta_weights/embedding_weight.py       | 9 +++------
 .../meta_weights/fused_moe/fused_moe_weight_ep.py        | 4 +---
 .../meta_weights/fused_moe/fused_moe_weight_tp.py        | 3 +--
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index e3dc0af19..e228d5c86 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -12,7 +12,7 @@
 
 class EmbeddingWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch.dtype):
-        BaseWeightTpl.__init__(self, data_type=data_type)
+        super().__init__()
         self.dim = dim
         self.vocab_size = vocab_size
         self.tp_world_size_ = get_dp_world_size()
@@ -24,7 +24,6 @@ def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch
         self.weight_name: str = weight_name
         self.data_type_ = data_type
         self._create_weight()
-        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
@@ -87,7 +86,7 @@ def __init__(
         data_type: torch.dtype,
         shared_weight: Optional[EmbeddingWeight] = None,
     ):
-        BaseWeightTpl.__init__(self, data_type=data_type)
+        super().__init__()
         self.dim = dim
         self.vocab_size = vocab_size
         self.tp_world_size_ = get_dp_world_size()
@@ -101,7 +100,6 @@ def __init__(
         self._shared_weight = shared_weight
         if shared_weight is None:
             self._create_weight()
-        PlatformAwareOp.__init__(self)
 
     @property
     def weight(self) -> torch.Tensor:
@@ -156,7 +154,7 @@ def __call__(self, input: torch.Tensor, out: Optional[torch.Tensor] = None, allo
 
 class NoTpPosEmbeddingWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, max_position_embeddings: int, weight_name: str, data_type: torch.dtype):
-        BaseWeightTpl.__init__(self, data_type=data_type)
+        super().__init__()
         self.dim = dim
         self.max_position_embeddings = max_position_embeddings
         self.weight_name: str = weight_name
@@ -164,7 +162,6 @@ def __init__(self, dim: int, max_position_embeddings: int, weight_name: str, dat
         self.tp_world_size_ = 1
         self.tp_rank_ = 0
         self._create_weight()
-        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
index 9a4feccdb..a84d19893 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
@@ -44,7 +44,7 @@ def __init__(
         quant_cfg=None,
         hidden_size: Optional[int] = None,
     ) -> None:
-        BaseWeightTpl.__init__(self, data_type=data_type)
+        super().__init__()
 
         self.layer_num = layer_num
         self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
@@ -112,8 +112,6 @@ def __init__(
         if self.hidden_size is not None:
             self._create_weight()
 
-        PlatformAwareOp.__init__(self)
-
     def _create_weight(self):
         """Pre-allocate GPU memory for fused MoE weights"""
         if self.hidden_size is None:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
index 51822f790..c6b3dc965 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
@@ -73,7 +73,7 @@ def __init__(
         layer_num: int,
         quant_cfg: Quantcfg = None,
     ) -> None:
-        BaseWeightTpl.__init__(self, data_type=data_type)
+        super().__init__()
         self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
         self.quantized_weight = quant_cfg.quantized_weight
         if self.quant_method.method_name != "none":
@@ -100,7 +100,6 @@ def __init__(
             self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
         )
         self._create_weight()
-        PlatformAwareOp.__init__(self)
 
     def _create_weight(self):
         total_expert_num = self.n_routed_experts

From 2a70eae1b7293200a85f2b6de9ab2f66e5d58ca9 Mon Sep 17 00:00:00 2001
From: sufubao <sufubao@sensetime.com>
Date: Tue, 13 Jan 2026 03:50:55 +0000
Subject: [PATCH 14/65] fix model call

---
 .../layer_weights/transformer_layer_weight.py | 12 ++--
 .../layer_weights/transformer_layer_weight.py | 72 ++++++++++---------
 .../layer_weights/transformer_layer_weight.py | 25 +++----
 .../pre_and_post_layer_weight.py              |  1 +
 .../layer_weights/transformer_layer_weight.py | 15 ++--
 .../layer_weights/transformer_layer_weight.py |  5 +-
 .../layer_weights/transformer_layer_weight.py | 16 +++--
 .../layer_weights/transformer_layer_weight.py |  6 +-
 .../layer_weights/transformer_layer_weight.py | 12 ++--
 .../layer_weights/transformer_layer_weight.py | 12 ++--
 .../layer_weights/transformer_layer_weight.py | 32 +++++----
 11 files changed, 114 insertions(+), 94 deletions(-)

diff --git a/lightllm/models/bloom/layer_weights/transformer_layer_weight.py b/lightllm/models/bloom/layer_weights/transformer_layer_weight.py
index 599893655..568a9c038 100644
--- a/lightllm/models/bloom/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/bloom/layer_weights/transformer_layer_weight.py
@@ -108,18 +108,18 @@ def load_hf_weights(self, weights):
 
     def _init_ffn(self):
         self.gate_up_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter],
             weight_names=self._gate_up_weight_name,
             data_type=self.data_type_,
             bias_names=self._gate_up_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="gate_up_proj",
+            quant_method=self.get_quant_method("gate_up_proj"),
         )
         self.down_proj = COLMMWeight(
+            in_dim=self.n_inter,
+            out_dims=[self.n_embed],
             weight_names=self._down_weight_name,
             data_type=self.data_type_,
             bias_names=self._down_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="down_proj",
+            quant_method=self.get_quant_method("down_proj"),
         )
diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
index 6ff081eef..05897203a 100644
--- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
@@ -43,6 +43,13 @@ def _parse_config(self):
             moe_mode = os.getenv("MOE_MODE", "TP")
             assert moe_mode == "TP"
             self.num_fused_shared_experts = self.network_config_.get("n_shared_experts", 0)
+        self.n_embed = self.network_config_["hidden_size"]
+        self.n_inter = self.network_config_["intermediate_size"]
+        self.moe_inter = self.network_config_.get("moe_intermediate_size", self.n_inter)
+        self.q_out_dim = self.num_attention_heads * (self.qk_nope_head_dim + self.qk_rope_head_dim)
+        self.kv_a_out_dim = self.kv_lora_rank + self.qk_rope_head_dim
+        self.kv_b_out_dim = self.num_attention_heads * (self.qk_nope_head_dim + self.v_head_dim)
+        self.o_in_dim = self.num_attention_heads * self.v_head_dim
 
     def _init_weight_names(self):
         if self.q_lora_rank is None:
@@ -140,40 +147,40 @@ def load_hf_weights(self, weights):
     def _init_qkvo(self):
         if self.q_lora_rank is None:
             self.q_weight_ = ROWMMWeight(
+                in_dim=self.n_embed,
+                out_dims=[self.q_out_dim],
                 weight_names=f"model.layers.{self.layer_num_}.self_attn.q_proj.weight",
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="q_weight",
+                quant_method=self.get_quant_method("q_weight"),
             )
             self.kv_a_proj_with_mqa_ = ROWMMWeight(
+                in_dim=self.n_embed,
+                out_dims=[self.kv_a_out_dim],
                 weight_names=f"model.layers.{self.layer_num_}.self_attn.kv_a_proj_with_mqa.weight",
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="kv_a_proj_with_mqa",
+                quant_method=self.get_quant_method("kv_a_proj_with_mqa"),
                 tp_rank=0,
                 tp_world_size=1,
             )
         else:
             self.qkv_a_proj_with_mqa_ = ROWMMWeight(
+                in_dim=self.n_embed,
+                out_dims=[self.q_lora_rank, self.kv_a_out_dim],
                 weight_names=[
                     f"model.layers.{self.layer_num_}.self_attn.q_a_proj.weight",
                     f"model.layers.{self.layer_num_}.self_attn.kv_a_proj_with_mqa.weight",
                 ],
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="qkv_a_proj_with_mqa",
+                quant_method=self.get_quant_method("qkv_a_proj_with_mqa"),
                 tp_rank=0,
                 tp_world_size=1,
             )
             self.q_b_proj_ = ROWMMWeight(
+                in_dim=self.q_lora_rank,
+                out_dims=[self.q_out_dim],
                 weight_names=f"model.layers.{self.layer_num_}.self_attn.q_b_proj.weight",
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="q_b_proj",
+                quant_method=self.get_quant_method("q_b_proj"),
             )
         # self.k_b_proj_ = ROWBMMWeight(
         #     weight_names=f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight",
@@ -191,65 +198,66 @@ def _init_qkvo(self):
         # )
         if self.enable_cc_method:
             self.cc_kv_b_proj_ = ROWMMWeight(
+                in_dim=self.kv_lora_rank,
+                out_dims=[self.kv_b_out_dim],
                 weight_names=f"model.layers.{self.layer_num_}.self_attn.kv_b_proj.weight",
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="cc_kv_b_proj",
+                quant_method=self.get_quant_method("cc_kv_b_proj"),
             )
 
         self.o_weight_ = COLMMWeight(
+            in_dim=self.o_in_dim,
+            out_dims=[self.n_embed],
             weight_names=f"model.layers.{self.layer_num_}.self_attn.o_proj.weight",
             data_type=self.data_type_,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="o_weight",
+            quant_method=self.get_quant_method("o_weight"),
         )
 
     def _load_mlp(self, mlp_prefix):
         moe_mode = os.getenv("MOE_MODE", "TP")
         if self.is_moe and moe_mode == "EP":
             self.gate_up_proj = ROWMMWeight(
+                in_dim=self.n_embed,
+                out_dims=[self.moe_inter, self.moe_inter],
                 weight_names=[f"{mlp_prefix}.gate_proj.weight", f"{mlp_prefix}.up_proj.weight"],
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="gate_up_proj",
+                quant_method=self.get_quant_method("gate_up_proj"),
                 tp_rank=0,
                 tp_world_size=1,
             )
             self.down_proj = COLMMWeight(
+                in_dim=self.moe_inter,
+                out_dims=[self.n_embed],
                 weight_names=f"{mlp_prefix}.down_proj.weight",
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="down_proj",
+                quant_method=self.get_quant_method("down_proj"),
                 tp_rank=0,
                 tp_world_size=1,
             )
         else:
             self.gate_up_proj = ROWMMWeight(
+                in_dim=self.n_embed,
+                out_dims=[self.n_inter, self.n_inter],
                 weight_names=[f"{mlp_prefix}.gate_proj.weight", f"{mlp_prefix}.up_proj.weight"],
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="gate_up_proj",
+                quant_method=self.get_quant_method("gate_up_proj"),
             )
             self.down_proj = COLMMWeight(
+                in_dim=self.n_inter,
+                out_dims=[self.n_embed],
                 weight_names=f"{mlp_prefix}.down_proj.weight",
                 data_type=self.data_type_,
-                quant_cfg=self.quant_cfg,
-                layer_num=self.layer_num_,
-                name="down_proj",
+                quant_method=self.get_quant_method("down_proj"),
             )
 
     def _init_moe(self):
         moe_intermediate_size = self.network_config_["moe_intermediate_size"]
         self.moe_gate = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_routed_experts],
             weight_names=f"model.layers.{self.layer_num_}.mlp.gate.weight",
             data_type=self.data_type_,
-            layer_num=self.layer_num_,
-            name="moe_gate",
+            quant_method=self.get_quant_method("moe_gate"),
             tp_rank=0,
             tp_world_size=1,
         )
diff --git a/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py b/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py
index 11e3c2f36..a4340a17a 100644
--- a/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gemma3/layer_weights/transformer_layer_weight.py
@@ -25,39 +25,40 @@ def _init_weight_names(self):
 
     def _init_ffn(self):
         self.gate_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter],
             weight_names=self._gate_weight_name,
             data_type=self.data_type_,
             bias_names=self._gate_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="gate_proj",
+            quant_method=self.get_quant_method("gate_proj"),
         )
         self.up_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter],
             weight_names=self._up_weight_name,
             data_type=self.data_type_,
             bias_names=self._up_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="up_proj",
+            quant_method=self.get_quant_method("up_proj"),
         )
         super()._init_ffn()
 
     def _init_qkv(self):
+        kv_out_dim = self.k_head_num_ * self.head_dim
         self.k_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[kv_out_dim],
             weight_names=self._k_weight_name,
             data_type=self.data_type_,
             bias_names=self._k_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="k_proj",
+            quant_method=self.get_quant_method("k_proj"),
         )
         self.v_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[kv_out_dim],
             weight_names=self._v_weight_name,
             data_type=self.data_type_,
             bias_names=self._v_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="v_proj",
+            quant_method=self.get_quant_method("v_proj"),
         )
         super()._init_qkv()
 
diff --git a/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py
index fbfb2ee75..23ae50f09 100644
--- a/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py
@@ -17,6 +17,7 @@ def __init__(self, data_type, network_config):
         self.lm_head_weight_ = self.wte_weight_
 
         self.final_norm_weight_ = NoTpGEMMANormWeight(
+            dim=hidden_size,
             weight_name="model.norm.weight",
             data_type=self.data_type_,
             bias_name=None,
diff --git a/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py b/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py
index 87b2fb744..19951b990 100644
--- a/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gemma_2b/layer_weights/transformer_layer_weight.py
@@ -11,21 +11,24 @@ def __init__(self, layer_num, data_type, network_config, quant_cfg=None):
         return
 
     def _init_qkv(self):
+        in_dim = self.n_embed
+        q_out_dim = self.q_head_num_ * self.head_dim
+        kv_out_dim = self.k_head_num_ * self.head_dim
         self.q_proj = ROWMMWeight(
+            in_dim=in_dim,
+            out_dims=[q_out_dim],
             weight_names=self._q_weight_name,
             data_type=self.data_type_,
             bias_names=self._q_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="q_proj",
+            quant_method=self.get_quant_method("q_proj"),
         )
         self.kv_proj = ROWMMWeight(
+            in_dim=in_dim,
+            out_dims=[kv_out_dim, kv_out_dim],
             weight_names=[self._k_weight_name, self._v_weight_name],
             data_type=self.data_type_,
             bias_names=[self._k_bias_name, self._v_bias_name],
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="kv_proj",
+            quant_method=self.get_quant_method("kv_proj"),
         )
 
     def _init_norm(self):
diff --git a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
index 5fb85aa1c..0e7f4c873 100644
--- a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
@@ -31,11 +31,12 @@ def _init_moe(self):
         assert moe_mode in ["TP"], "For now, GPT-OSS type model only support MOE TP mode."
 
         self.moe_gate = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[n_routed_experts],
             weight_names=self._router_weight_name,
             data_type=self.data_type_,
-            layer_num=self.layer_num_,
             bias_names=self._router_bias_name,
-            name="moe_gate",
+            quant_method=self.get_quant_method("moe_gate"),
             tp_rank=0,
             tp_world_size=1,
         )
diff --git a/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py b/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
index b58e58799..2cbc6cf58 100644
--- a/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
@@ -7,6 +7,10 @@ def __init__(self, layer_num, data_type, network_config, quant_cfg=None):
         super().__init__(layer_num, data_type, network_config, quant_cfg)
         return
 
+    def _parse_config(self):
+        self.n_embed = self.network_config_["hidden_size"]
+        self.n_inter = self.network_config_["intermediate_size"]
+
     def _init_weight_names(self):
         self._gate_weight_name = f"mtp.layers.{self.layer_num_}.mlp.gate_proj.weight"
         self._gate_bias_name = None
@@ -24,20 +28,20 @@ def _init_weight(self):
 
     def _init_ffn(self):
         self.gate_up_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter, self.n_inter],
             weight_names=[self._gate_weight_name, self._up_weight_name],
             data_type=self.data_type_,
             bias_names=[self._gate_bias_name, self._up_bias_name],
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="gate_up_proj",
+            quant_method=self.get_quant_method("gate_up_proj"),
         )
         self.down_proj = COLMMWeight(
+            in_dim=self.n_inter,
+            out_dims=[self.n_embed],
             weight_names=self._down_weight_name,
             data_type=self.data_type_,
             bias_names=self._down_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="down_proj",
+            quant_method=self.get_quant_method("down_proj"),
         )
 
     def _init_norm(self):
diff --git a/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py b/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
index cc125f926..fa20a63f9 100644
--- a/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
@@ -34,12 +34,12 @@ def _init_moe(self):
         split_inter_size = inter_size // self.tp_world_size_
 
         self.moe_gate = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_routed_experts],
             weight_names=self.moe_gate_weight_name,
             data_type=self.data_type_,
             bias_names=self.moe_gate_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="moe_gate",
+            quant_method=self.get_quant_method("moe_gate"),
             tp_rank=0,
             tp_world_size=1,  # no tensor parallelism
         )
diff --git a/lightllm/models/starcoder/layer_weights/transformer_layer_weight.py b/lightllm/models/starcoder/layer_weights/transformer_layer_weight.py
index 41f24f79c..4adbe4f5e 100644
--- a/lightllm/models/starcoder/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/starcoder/layer_weights/transformer_layer_weight.py
@@ -51,18 +51,18 @@ def _init_weight_names(self):
 
     def _init_ffn(self):
         self.gate_up_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter],
             weight_names=self._gate_up_weight_name,
             data_type=self.data_type_,
             bias_names=self._gate_up_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="gate_up_proj",
+            quant_method=self.get_quant_method("gate_up_proj"),
         )
         self.down_proj = COLMMWeight(
+            in_dim=self.n_inter,
+            out_dims=[self.n_embed],
             weight_names=self._down_weight_name,
             data_type=self.data_type_,
             bias_names=self._down_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="down_proj",
+            quant_method=self.get_quant_method("down_proj"),
         )
diff --git a/lightllm/models/starcoder2/layer_weights/transformer_layer_weight.py b/lightllm/models/starcoder2/layer_weights/transformer_layer_weight.py
index 53342e221..7370c6953 100644
--- a/lightllm/models/starcoder2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/starcoder2/layer_weights/transformer_layer_weight.py
@@ -28,18 +28,18 @@ def _init_weight_names(self):
 
     def _init_ffn(self):
         self.up_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter],
             weight_names=self._up_weight_name,
             data_type=self.data_type_,
             bias_names=self._up_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="up_proj",
+            quant_method=self.get_quant_method("up_proj"),
         )
         self.down_proj = COLMMWeight(
+            in_dim=self.n_inter,
+            out_dims=[self.n_embed],
             weight_names=self._down_weight_name,
             data_type=self.data_type_,
             bias_names=self._down_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="down_proj",
+            quant_method=self.get_quant_method("down_proj"),
         )
diff --git a/lightllm/models/vit/layer_weights/transformer_layer_weight.py b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
index 5a7a24a9a..8bcbe3358 100644
--- a/lightllm/models/vit/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
@@ -31,6 +31,9 @@ def _parse_config(self):
         self.qkv_bias = self.network_config_.get("qkv_bias", True)
         self.layer_norm_eps = self.network_config_.get("layer_norm_eps", 1e-6)
         self.norm_type = self.network_config_.get("norm_type", "layer_norm")
+        self.n_embed = self.network_config_["hidden_size"] + self.padding_hidden_size
+        mlp_ratio = self.network_config_.get("mlp_ratio", 4)
+        self.n_inter = self.network_config_.get("intermediate_size", int(self.n_embed * mlp_ratio))
 
     def _init_weight_names(self):
         self._att_norm_weight_name = f"vision_model.encoder.layers.{self.layer_num_}.norm1.weight"
@@ -83,41 +86,41 @@ def _init_weight(self):
 
     def _init_qkv(self):
         self.qkv_proj = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_embed, self.n_embed, self.n_embed],
             weight_names=[self._q_weight_name, self._k_weight_name, self._v_weight_name],
             data_type=self.data_type_,
             bias_names=[self._q_bias_name, self._k_bias_name, self._v_bias_name],
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="qkv_proj",
+            quant_method=self.get_quant_method("qkv_proj"),
         )
 
     def _init_o(self):
         self.o_proj = COLMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_embed],
             weight_names=self._o_weight_name,
             data_type=self.data_type_,
             bias_names=self._o_bias_name,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="o_proj",
+            quant_method=self.get_quant_method("o_proj"),
         )
 
     def _init_ffn(self):
         self.ffn_1_proj_ = ROWMMWeight(
+            in_dim=self.n_embed,
+            out_dims=[self.n_inter],
             weight_names=self.fc1_weight_name_,
             data_type=self.data_type_,
             bias_names=self.fc1_bias_name_,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="ffn_1_proj",
+            quant_method=self.get_quant_method("ffn_1_proj"),
         )
 
         self.ffn_2_proj_ = COLMMWeight(
+            in_dim=self.n_inter,
+            out_dims=[self.n_embed],
             weight_names=self.fc2_weight_name_,
             data_type=self.data_type_,
             bias_names=self.fc2_bias_name_,
-            quant_cfg=self.quant_cfg,
-            layer_num=self.layer_num_,
-            name="ffn_2_proj",
+            quant_method=self.get_quant_method("ffn_2_proj"),
         )
 
     def _init_norm(self):
@@ -136,18 +139,17 @@ def _init_norm(self):
             bias_name=self._ffn_norm_bias_name,
         )
         if self.qk_norm:
-            head_num = self.network_config_["num_attention_heads"]
             self.q_norm_weight_ = TpRMSNormWeight(
                 dim=hidden_size,
                 weight_name=self._q_norm_weight_name,
                 data_type=self.data_type_,
-                head_num=head_num,
+                bias_name=None,
             )
             self.k_norm_weight_ = TpRMSNormWeight(
                 dim=hidden_size,
                 weight_name=self._k_norm_weight_name,
                 data_type=self.data_type_,
-                head_num=head_num,
+                bias_name=None,
             )
 
     def load_hf_weights(self, weights):

From 0e17bf66b1d249082eadda6b35576086f20190df Mon Sep 17 00:00:00 2001
From: sufubao <sufubao@sensetime.com>
Date: Tue, 13 Jan 2026 05:28:30 +0000
Subject: [PATCH 15/65] remove torchao

---
 docs/CN/source/tutorial/api_server_args.rst   |  19 +-
 docs/EN/source/tutorial/api_server_args.rst   |  19 +-
 lightllm/common/quantization/__init__.py      |   1 -
 lightllm/common/quantization/torchao_quant.py | 168 ------------------
 lightllm/server/api_cli.py                    |  10 +-
 5 files changed, 13 insertions(+), 204 deletions(-)
 delete mode 100644 lightllm/common/quantization/torchao_quant.py

diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst
index b7f6312a6..e86929a89 100644
--- a/docs/CN/source/tutorial/api_server_args.rst
+++ b/docs/CN/source/tutorial/api_server_args.rst
@@ -367,17 +367,14 @@ PD 分离模式参数
 .. option:: --quant_type
 
     量化方法，可选值：
-    
-    * ``ppl-w4a16-128``
-    * ``flashllm-w6a16``
-    * ``ao-int4wo-[32,64,128,256]``
-    * ``ao-int8wo``
-    * ``ao-fp8w8a16``
-    * ``ao-fp6w6a16``
+
     * ``vllm-w8a8``
     * ``vllm-fp8w8a8``
     * ``vllm-fp8w8a8-b128``
+    * ``deepgemm-fp8w8a8-b128``
     * ``triton-fp8w8a8-block128``
+    * ``awq``
+    * ``awq_marlin``
     * ``none`` (默认)
 
 .. option:: --quant_cfg
@@ -389,13 +386,7 @@ PD 分离模式参数
 .. option:: --vit_quant_type
 
     ViT 量化方法，可选值：
-    
-    * ``ppl-w4a16-128``
-    * ``flashllm-w6a16``
-    * ``ao-int4wo-[32,64,128,256]``
-    * ``ao-int8wo``
-    * ``ao-fp8w8a16``
-    * ``ao-fp6w6a16``
+
     * ``vllm-w8a8``
     * ``vllm-fp8w8a8``
     * ``none`` (默认)
diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst
index 18fe54c55..73cf12513 100644
--- a/docs/EN/source/tutorial/api_server_args.rst
+++ b/docs/EN/source/tutorial/api_server_args.rst
@@ -359,17 +359,14 @@ Quantization Parameters
 .. option:: --quant_type
 
     Quantization method, optional values:
-    
-    * ``ppl-w4a16-128``
-    * ``flashllm-w6a16``
-    * ``ao-int4wo-[32,64,128,256]``
-    * ``ao-int8wo``
-    * ``ao-fp8w8a16``
-    * ``ao-fp6w6a16``
+
     * ``vllm-w8a8``
     * ``vllm-fp8w8a8``
     * ``vllm-fp8w8a8-b128``
+    * ``deepgemm-fp8w8a8-b128``
     * ``triton-fp8w8a8-block128``
+    * ``awq``
+    * ``awq_marlin``
     * ``none`` (default)
 
 .. option:: --quant_cfg
@@ -381,13 +378,7 @@ Quantization Parameters
 .. option:: --vit_quant_type
 
     ViT quantization method, optional values:
-    
-    * ``ppl-w4a16-128``
-    * ``flashllm-w6a16``
-    * ``ao-int4wo-[32,64,128,256]``
-    * ``ao-int8wo``
-    * ``ao-fp8w8a16``
-    * ``ao-fp6w6a16``
+
     * ``vllm-w8a8``
     * ``vllm-fp8w8a8``
     * ``none`` (default)
diff --git a/lightllm/common/quantization/__init__.py b/lightllm/common/quantization/__init__.py
index ecf2e6d42..af1327cd8 100644
--- a/lightllm/common/quantization/__init__.py
+++ b/lightllm/common/quantization/__init__.py
@@ -1,7 +1,6 @@
 import yaml
 import collections
 from .registry import QUANTMETHODS
-from .torchao_quant import *
 from .w8a8_quant import *
 from .triton_quant.triton_quant import *
 from .deepgemm_quant import *
diff --git a/lightllm/common/quantization/torchao_quant.py b/lightllm/common/quantization/torchao_quant.py
deleted file mode 100644
index d1db65b35..000000000
--- a/lightllm/common/quantization/torchao_quant.py
+++ /dev/null
@@ -1,168 +0,0 @@
-import os
-import torch
-from .quantize_method import QuantizationMethod
-from .registry import QUANTMETHODS
-import torch.nn.functional as F
-from typing import TYPE_CHECKING, Optional
-
-from .quantize_method import WeightPack
-
-try:
-    HAS_TORCH_AO = True
-    from torchao.quantization import (
-        int4_weight_only,
-        int8_weight_only,
-        float8_weight_only,
-        fpx_weight_only,
-        int8_dynamic_activation_int8_weight,
-        float8_dynamic_activation_float8_weight,
-        quantize_,
-    )
-    from torchao.utils import (
-        TORCH_VERSION_AT_LEAST_2_4,
-        TORCH_VERSION_AT_LEAST_2_5,
-    )
-except:
-    HAS_TORCH_AO = False
-
-
-class AOBaseQuantizationMethod(QuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        assert HAS_TORCH_AO, "torchao is not installed, you can't use quant api of it"
-        assert TORCH_VERSION_AT_LEAST_2_4, "torchao requires torch >=2.4"
-        self.quant_func = None
-
-    def quantize(self, weight: torch.Tensor, offset: int = 0) -> WeightPack:
-        """ """
-        dummy_linear = torch.nn.Linear(weight.shape[1], weight.shape[0], bias=False)
-        dummy_linear.weight = torch.nn.Parameter(weight.cuda(self.device_id_))
-        quantize_(dummy_linear, self.quant_func)
-        return WeightPack(weight=dummy_linear.weight, weight_scale=None, weight_zero_point=None)
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-    ) -> torch.Tensor:
-        weights = weight_pack.weight
-        bias = weight_pack.bias
-        return F.linear(input_tensor, weights, bias)
-
-    @property
-    def method_name(self):
-        return "ao-base"
-
-
-@QUANTMETHODS.register(["ao-w4a16-256"])
-class AOW4A16QuantizationMethodGroup256(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.group_size = 256
-        self.quant_func = int4_weight_only(group_size=self.group_size)
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-w4a16-256"
-
-
-@QUANTMETHODS.register(["ao-w4a16-128"])
-class AOW4A16QuantizationMethodGroup128(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.group_size = 128
-        self.quant_func = int4_weight_only(group_size=self.group_size)
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-w4a16-128"
-
-
-@QUANTMETHODS.register(["ao-w4a16-64"])
-class AOW4A16QuantizationMethodGroup64(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.group_size = 64
-        self.quant_func = int4_weight_only(group_size=self.group_size)
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-w4a16-64"
-
-
-@QUANTMETHODS.register(["ao-w4a16-32"])
-class AOW4A16QuantizationMethodGroup32(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.group_size = 32
-        self.quant_func = int4_weight_only(group_size=self.group_size)
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-w4a16-32"
-
-
-@QUANTMETHODS.register("ao-w8a8")
-class AOW8A8QuantizationMethod(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.quant_func = int8_dynamic_activation_int8_weight()
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-w8a8"
-
-
-@QUANTMETHODS.register("ao-w8a16")
-class AOW8A16QuantizationMethod(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.quant_func = int8_weight_only()
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-w8a16"
-
-
-@QUANTMETHODS.register("ao-fp8w8a16")
-class AOFP8W8A16QuantizationMethod(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        is_cuda_8_9 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
-        assert is_cuda_8_9, "FP8 requires GPU with compute capability >= 8.9"
-        self.quant_func = float8_weight_only()
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-fp8w8a16"
-
-
-@QUANTMETHODS.register("ao-fp6w6a16")
-class AOFP6W6A16QuantizationMethod(AOBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        assert TORCH_VERSION_AT_LEAST_2_5, "torchao fp6 requires torch >=2.5"
-        self.quant_func = fpx_weight_only(3, 2)
-        self.has_weight_scale = False
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "ao-fp6w6a16"
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index ac883b1a4..05da4ccb3 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -465,10 +465,8 @@ def make_argument_parser() -> argparse.ArgumentParser:
         "--quant_type",
         type=str,
         default="none",
-        help="""Quantization method: ppl-w4a16-128 | flashllm-w6a16
-                        | ao-int4wo-[32,64,128,256] | ao-int8wo | ao-fp8w8a16 | ao-fp6w6a16
-                        | vllm-w8a8 | vllm-fp8w8a8 | vllm-fp8w8a8-b128
-                        | triton-fp8w8a8-block128""",
+        help="""Quantization method: vllm-w8a8 | vllm-fp8w8a8 | vllm-fp8w8a8-b128
+                        | deepgemm-fp8w8a8-b128 | triton-fp8w8a8-block128 | awq | awq_marlin""",
     )
     parser.add_argument(
         "--quant_cfg",
@@ -481,9 +479,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
         "--vit_quant_type",
         type=str,
         default="none",
-        help="""Quantization method: ppl-w4a16-128 | flashllm-w6a16
-                        | ao-int4wo-[32,64,128,256] | ao-int8wo | ao-fp8w8a16 | ao-fp6w6a16
-                        | vllm-w8a8 | vllm-fp8w8a8""",
+        help="""Quantization method for ViT: vllm-w8a8 | vllm-fp8w8a8""",
     )
     parser.add_argument(
         "--vit_quant_cfg",

From b7393ab41053b1205db4322c74394fd3f0070619 Mon Sep 17 00:00:00 2001
From: sufubao <sufubao@sensetime.com>
Date: Tue, 13 Jan 2026 12:20:53 +0000
Subject: [PATCH 16/65] quantization draft

---
 lightllm/common/quantization/__init__.py      |  41 ++-
 lightllm/common/quantization/backend.py       |  82 +++++
 .../common/quantization/deepgemm_quant.py     | 136 -------
 .../quantization/triton_quant/triton_quant.py | 112 ------
 .../common/quantization/types/__init__.py     |  13 +
 .../{awq_quant.py => types/awq.py}            | 332 ++++++++++--------
 .../common/quantization/types/fp8_block128.py | 216 ++++++++++++
 .../quantization/types/fp8_per_token.py       | 172 +++++++++
 .../quantization/{ => types}/no_quant.py      |   7 +-
 lightllm/common/quantization/types/w8a8.py    | 108 ++++++
 lightllm/common/quantization/w8a8_quant.py    | 253 -------------
 11 files changed, 801 insertions(+), 671 deletions(-)
 create mode 100644 lightllm/common/quantization/backend.py
 delete mode 100644 lightllm/common/quantization/deepgemm_quant.py
 delete mode 100644 lightllm/common/quantization/triton_quant/triton_quant.py
 create mode 100644 lightllm/common/quantization/types/__init__.py
 rename lightllm/common/quantization/{awq_quant.py => types/awq.py} (62%)
 create mode 100644 lightllm/common/quantization/types/fp8_block128.py
 create mode 100644 lightllm/common/quantization/types/fp8_per_token.py
 rename lightllm/common/quantization/{ => types}/no_quant.py (90%)
 create mode 100644 lightllm/common/quantization/types/w8a8.py
 delete mode 100644 lightllm/common/quantization/w8a8_quant.py

diff --git a/lightllm/common/quantization/__init__.py b/lightllm/common/quantization/__init__.py
index af1327cd8..d5289298c 100644
--- a/lightllm/common/quantization/__init__.py
+++ b/lightllm/common/quantization/__init__.py
@@ -1,13 +1,21 @@
 import yaml
 import collections
 from .registry import QUANTMETHODS
-from .w8a8_quant import *
-from .triton_quant.triton_quant import *
-from .deepgemm_quant import *
-from .awq_quant import *
-from .no_quant import *
+from .backend import QUANT_BACKEND
 from lightllm.utils.log_utils import init_logger
 
+# Import all type classes (they auto-register with QUANTMETHODS)
+from .types import (
+    NoQuantization,
+    FP8Block128Quantization,
+    FP8PerTokenQuantization,
+    W8A8Quantization,
+    AWQQuantization,
+)
+
+# Re-export for backwards compatibility
+from .types.awq import is_awq_marlin_compatible
+
 logger = init_logger(__name__)
 
 
@@ -37,20 +45,21 @@ def _mapping_quant_method(self):
         if self.hf_quantization_method == "fp8":
             block_size = self.hf_quantization_config.get("weight_block_size", None)
             if block_size == [128, 128]:
-                from lightllm.common.quantization.deepgemm_quant import HAS_DEEPGEMM
-
-                if HAS_DEEPGEMM:
-                    self.quant_type = "deepgemm-fp8w8a8-b128"
-                else:
-                    self.quant_type = "vllm-fp8w8a8-b128"
-                logger.info(f"select fp8w8a8-b128 quant way: {self.quant_type}")
+                self.quant_type = "fp8-block128"
+                logger.info(
+                    f"Selected quant type: fp8-block128, backend: {QUANT_BACKEND.get_backend('fp8-block128').name}"
+                )
+            else:
+                self.quant_type = "fp8-per-token"
+                logger.info(
+                    f"Selected quant type: fp8-per-token, backend: {QUANT_BACKEND.get_backend('fp8-per-token').name}"
+                )
         elif self.hf_quantization_method == "awq":
             self.quant_type = "awq"
-            if is_awq_marlin_compatible(self.hf_quantization_config):
-                self.quant_type = "awq_marlin"
-            logger.info(f"select awq quant way: {self.quant_type}")
+            logger.info("Selected quant type: awq (marlin auto-selected if compatible)")
         else:
-            # TODO: more quant method
+            # TODO: more quant methods
+            raise NotImplementedError(f"Quant method {self.hf_quantization_method} not implemented yet.")
             pass
 
     def _parse_custom_cfg(self, custom_cfg_path):
diff --git a/lightllm/common/quantization/backend.py b/lightllm/common/quantization/backend.py
new file mode 100644
index 000000000..e6d081ec2
--- /dev/null
+++ b/lightllm/common/quantization/backend.py
@@ -0,0 +1,82 @@
+import os
+from enum import Enum, auto
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+
+class BackendType(Enum):
+    TRITON = auto()
+    VLLM = auto()
+    DEEPGEMM = auto()
+
+
+class BackendRegistry:
+    _instance = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self):
+        if self._initialized:
+            return
+        self._initialized = True
+
+        self._force_triton = os.getenv("LIGHTLLM_USE_TRITON_QUANT", "0").upper() in ["1", "TRUE", "ON"]
+
+        self._has_vllm = self._check_vllm()
+        self._has_deepgemm = self._check_deepgemm()
+
+        if self._force_triton:
+            logger.info("LIGHTLLM_USE_TRITON_QUANT is set, forcing Triton backend for quantization")
+        else:
+            logger.info(f"Available quantization backends: vLLM={self._has_vllm}, DeepGEMM={self._has_deepgemm}")
+
+    def _check_vllm(self) -> bool:
+        try:
+            from lightllm.utils.vllm_utils import HAS_VLLM
+
+            return HAS_VLLM
+        except ImportError:
+            return False
+
+    def _check_deepgemm(self) -> bool:
+        """Return True only if ``deep_gemm`` can be imported in this process."""
+        try:
+            import deep_gemm  # noqa: F401
+        except Exception:  # any import-time failure (not only ImportError) means DeepGEMM is unusable
+            return False
+        return True
+
+    @property
+    def force_triton(self) -> bool:
+        return self._force_triton
+
+    @property
+    def has_vllm(self) -> bool:
+        return self._has_vllm
+
+    @property
+    def has_deepgemm(self) -> bool:
+        return self._has_deepgemm
+
+    def get_backend(self, quant_type: str) -> BackendType:
+        if self._force_triton:
+            return BackendType.TRITON
+
+        if quant_type == "fp8-block128":
+            if self._has_deepgemm:
+                return BackendType.DEEPGEMM
+            elif self._has_vllm:
+                return BackendType.VLLM
+        elif quant_type in ["w8a8", "fp8-per-token"]:
+            if self._has_vllm:
+                return BackendType.VLLM
+
+        return BackendType.TRITON
+
+
+QUANT_BACKEND = BackendRegistry()
diff --git a/lightllm/common/quantization/deepgemm_quant.py b/lightllm/common/quantization/deepgemm_quant.py
deleted file mode 100644
index 86dd9b572..000000000
--- a/lightllm/common/quantization/deepgemm_quant.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-import torch
-from torch.types import Device
-from .quantize_method import QuantizationMethod
-from .registry import QUANTMETHODS
-import torch.nn.functional as F
-from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import (
-    per_token_group_quant_fp8,
-    tma_align_input_scale,
-)
-from typing import TYPE_CHECKING, Optional
-
-from .quantize_method import WeightPack
-
-try:
-    HAS_DEEPGEMM = True
-    import deep_gemm
-except:
-    HAS_DEEPGEMM = False
-
-
-class DeepGEMMBaseQuantizationMethod(QuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-
-        self.cache_manager = g_cache_manager
-        assert HAS_DEEPGEMM, "deepgemm is not installed, you can't use quant api of it"
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
-        raise NotImplementedError("Not implemented")
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        raise NotImplementedError("Not implemented")
-
-    @property
-    def method_name(self):
-        return "deepgemm-base"
-
-
-@QUANTMETHODS.register(["deepgemm-fp8w8a8-b128"])
-class DeepGEMMFP8w8a8B128QuantizationMethod(DeepGEMMBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.block_size = 128
-        self.weight_suffix = None
-        self.weight_zero_point_suffix = None
-        self.weight_scale_suffix = "weight_scale_inv"
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
-
-    @property
-    def method_name(self):
-        return "deepgemm-fp8w8a8-b128"
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
-        from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
-
-        device = output.weight.device
-        weight, scale = weight_quant(weight.cuda(device), self.block_size)
-        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
-        output.weight_scale[offset // self.block_size : offset + weight.shape[0] // self.block_size].copy_(scale)
-        return
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: "WeightPack",
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight
-        weight_scale = weight_pack.weight_scale
-        input_scale = None
-        alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
-        m, k = input_tensor.shape
-        n = qweight.shape[0]
-        if input_scale is None:
-            qinput_tensor, input_scale = per_token_group_quant_fp8(
-                input_tensor,
-                self.block_size,
-                dtype=qweight.dtype,
-                column_major_scales=True,
-                scale_tma_aligned=True,
-                alloc_func=alloc_func,
-            )
-
-        if out is None:
-            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-        _deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight, weight_scale), out)
-        return out
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        weight_scale = torch.empty(
-            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
-        ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
-
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
-        return
-
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[
-            start_idx // self.block_size : start_idx + weight_scale.shape[0] // self.block_size
-        ].copy_(weight_scale)
-        return
-
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_zero_point[
-            start_idx // self.block_size : start_idx + weight_zero_point.shape[0] // self.block_size
-        ].copy_(weight_zero_point)
-        return
-
-
-def _deepgemm_fp8_nt(a_tuple, b_tuple, out):
-    if HAS_DEEPGEMM:
-        if hasattr(deep_gemm, "gemm_fp8_fp8_bf16_nt"):
-            return deep_gemm.gemm_fp8_fp8_bf16_nt([a_tuple[0], a_tuple[1]], [b_tuple[0], b_tuple[1]], out)
-        if hasattr(deep_gemm, "fp8_gemm_nt"):
-            return deep_gemm.fp8_gemm_nt((a_tuple[0], a_tuple[1]), (b_tuple[0], b_tuple[1]), out)
-    raise RuntimeError("deep_gemm does not provide fp8 NT GEMM kernel in this version")
diff --git a/lightllm/common/quantization/triton_quant/triton_quant.py b/lightllm/common/quantization/triton_quant/triton_quant.py
deleted file mode 100644
index 9f6a7bee2..000000000
--- a/lightllm/common/quantization/triton_quant/triton_quant.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import os
-import torch
-import torch.nn.functional as F
-from lightllm.common.quantization.quantize_method import QuantizationMethod
-from lightllm.common.quantization.registry import QUANTMETHODS
-from .fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
-from .fp8.fp8act_quant_kernel import per_token_group_quant_fp8
-from typing import TYPE_CHECKING, Optional
-
-from lightllm.common.quantization.quantize_method import WeightPack
-
-
-class TritonBaseQuantizationMethod(QuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-
-        self.cache_manager = g_cache_manager
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> WeightPack:
-        raise NotImplementedError("Not implemented")
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        raise NotImplementedError("Not implemented")
-
-
-@QUANTMETHODS.register(["triton-fp8w8a8-block128"])
-class TritonFP8w8a8QuantizationMethod(TritonBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.is_moe = False
-        self.block_size = 128
-        self.weight_suffix = None
-        self.weight_zero_point_suffix = None
-        self.weight_scale_suffix = "weight_scale_inv"
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        # TODO block-wise quant kernel
-        raise NotImplementedError("Not implemented")
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight
-        weight_scale = weight_pack.weight_scale
-        input_scale = None
-        m, k = input_tensor.shape
-        n = qweight.shape[1]
-        alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
-        if input_scale is None:
-            input_tensor_q, input_scale = per_token_group_quant_fp8(
-                input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
-            )
-        else:
-            # TODO
-            raise "statci input scale is not supported by triton fp8 block gemm kernel."
-        m = input_tensor.shape[0]
-        n = qweight.shape[1]
-        if out is None:
-            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-        w8a8_block_fp8_matmul(
-            input_tensor_q,
-            qweight,
-            input_scale,
-            weight_scale,
-            out,
-            (self.block_size, self.block_size),
-            dtype=input_tensor.dtype,
-        )
-        return out
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        weight_scale = torch.empty(
-            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
-        ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
-
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
-        return
-
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[
-            start_idx // self.block_size : start_idx + weight_scale.shape[0] // self.block_size
-        ].copy_(weight_scale)
-        return
-
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_zero_point[
-            start_idx // self.block_size : start_idx + weight_zero_point.shape[0] // self.block_size
-        ].copy_(weight_zero_point)
-        return
diff --git a/lightllm/common/quantization/types/__init__.py b/lightllm/common/quantization/types/__init__.py
new file mode 100644
index 000000000..8cbcc2e68
--- /dev/null
+++ b/lightllm/common/quantization/types/__init__.py
@@ -0,0 +1,13 @@
+from .no_quant import NoQuantization
+from .fp8_block128 import FP8Block128Quantization
+from .fp8_per_token import FP8PerTokenQuantization
+from .w8a8 import W8A8Quantization
+from .awq import AWQQuantization
+
+__all__ = [
+    "NoQuantization",
+    "FP8Block128Quantization",
+    "FP8PerTokenQuantization",
+    "W8A8Quantization",
+    "AWQQuantization",
+]
diff --git a/lightllm/common/quantization/awq_quant.py b/lightllm/common/quantization/types/awq.py
similarity index 62%
rename from lightllm/common/quantization/awq_quant.py
rename to lightllm/common/quantization/types/awq.py
index d523cce75..eedc5b67b 100644
--- a/lightllm/common/quantization/awq_quant.py
+++ b/lightllm/common/quantization/types/awq.py
@@ -1,66 +1,78 @@
-import os
 import torch
-from .quantize_method import QuantizationMethod
-from .registry import QUANTMETHODS
-import torch.nn.functional as F
-from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
-from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
-from typing import Any
-from typing import TYPE_CHECKING, Optional, Tuple
-from lightllm.utils.dist_utils import get_current_device_id
-
-from .quantize_method import WeightPack
-
-if HAS_VLLM:
-    awq_dequantize = vllm_ops.awq_dequantize
-    awq_gemm = vllm_ops.awq_gemm
-    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-        check_marlin_supported,
-        marlin_permute_scales,
-        awq_to_marlin_zero_points,
-        should_use_atomic_add_reduce,
-        marlin_make_empty_g_idx,
-        marlin_make_workspace_new,
-    )
-    from vllm.scalar_type import scalar_types
+from typing import Any, Optional, Tuple
 
-    TYPE_MAP = {
-        4: scalar_types.uint4,
-        8: scalar_types.uint8,
-    }
+from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
+from lightllm.common.quantization.registry import QUANTMETHODS
+from lightllm.utils.dist_utils import get_current_device_id
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops
+
+    if HAS_VLLM:
+        awq_dequantize = vllm_ops.awq_dequantize
+        awq_gemm = vllm_ops.awq_gemm
+        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+            check_marlin_supported,
+            marlin_permute_scales,
+            awq_to_marlin_zero_points,
+            should_use_atomic_add_reduce,
+            marlin_make_empty_g_idx,
+            marlin_make_workspace_new,
+        )
+        from vllm.scalar_type import scalar_types
+
+        TYPE_MAP = {
+            4: scalar_types.uint4,
+            8: scalar_types.uint8,
+        }
+    else:
+        awq_dequantize = None
+        awq_gemm = None
+        TYPE_MAP = {}
+except ImportError:
+    HAS_VLLM = False
+    awq_dequantize = None
+    awq_gemm = None
+    TYPE_MAP = {}
+
+
+def is_awq_marlin_compatible(quantization_config: dict[str, Any]) -> bool:
+    if not HAS_VLLM:
+        return False
 
+    quant_method = quantization_config.get("quant_method", "").lower()
+    num_bits = quantization_config.get("bits")
+    group_size = quantization_config.get("group_size")
+    zero_point = quantization_config.get("zero_point")
 
-class AWQBaseQuantizationMethod(QuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        assert HAS_VLLM, "vllm are not installed, you can't use quant api of them."
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+    if not torch.cuda.is_available():
+        return False
 
-        self.cache_manager = g_cache_manager
+    if quant_method != "awq":
+        return False
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
-        raise NotImplementedError("AWQ online quantization is not supported yet.")
+    if num_bits is None or group_size is None or zero_point is None:
+        return False
 
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        raise NotImplementedError("AWQ online quantization is not supported yet.")
+    if num_bits not in TYPE_MAP:
+        return False
 
-    @property
-    def method_name(self):
-        return "awq-base"
+    return check_marlin_supported(quant_type=TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point)
 
 
-@QUANTMETHODS.register("awq")
-class AWQW4A16QuantizationMethod(AWQBaseQuantizationMethod):
+@QUANTMETHODS.register(["awq", "awq_marlin"])
+class AWQQuantization(QuantizationMethod):
     def __init__(self):
         super().__init__()
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+
+        if not HAS_VLLM:
+            raise RuntimeError("vLLM is required for AWQ quantization but is not installed.")
+
+        self.cache_manager = g_cache_manager
         self.pack_factor = 8
         self.weight_scale_suffix = "scales"
         self.weight_zero_point_suffix = "qzeros"
@@ -68,11 +80,38 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = True
 
+        self._use_marlin = False
+        self._marlin_initialized = False
+
+    def _init_marlin(self):
+        if self._marlin_initialized:
+            return
+
+        self.nbits = 4
+        self.g_idx = marlin_make_empty_g_idx(torch.device("cuda"))
+        self.g_idx_sort_indices = marlin_make_empty_g_idx(torch.device("cuda"))
+        self.workspace = marlin_make_workspace_new(torch.device("cuda"))
+        self.vllm_quant_type = TYPE_MAP[self.nbits]
+        self.tile_size = 16
+        self._marlin_initialized = True
+
+    def _check_and_set_marlin(self):
+        if self.hf_quantization_config is None:
+            self._use_marlin = False
+            return
+
+        self._use_marlin = is_awq_marlin_compatible(self.hf_quantization_config)
+        if self._use_marlin:
+            self._init_marlin()
+            logger.info("AWQQuantization using Marlin backend")
+        else:
+            logger.info("AWQQuantization using basic AWQ backend")
+
     @property
     def method_name(self):
         return "awq"
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
     def apply(
@@ -83,6 +122,22 @@ def apply(
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
         bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if not hasattr(self, "_checked_marlin"):
+            self._check_and_set_marlin()
+            self._checked_marlin = True
+
+        if self._use_marlin:
+            return self._apply_marlin(input_tensor, weight_pack, out, bias)
+        else:
+            return self._apply_basic(input_tensor, weight_pack, out, bias)
+
+    def _apply_basic(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor],
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
@@ -99,81 +154,12 @@ def apply(
             out.add_(bias)
         return out
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        group_size = self.hf_quantization_config["group_size"]
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (in_dim, out_dim // self.pack_factor), dtype=torch.int32).cuda(device_id)
-        weight_scale = torch.empty(expert_prefix + (in_dim // group_size, out_dim), dtype=dtype).cuda(device_id)
-        weight_zero_point = torch.empty(
-            expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
-        ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
-
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        start_idx = start_idx // self.pack_factor
-        weight_pack.weight[:, start_idx : start_idx + weight.shape[1]].copy_(weight)
-        return
-
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[:, start_idx : start_idx + weight_scale.shape[1]].copy_(weight_scale)
-        return
-
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        start_idx = start_idx // self.pack_factor
-        end_idx = start_idx + weight_zero_point.shape[1]
-        weight_pack.weight_zero_point[:, start_idx:end_idx].copy_(weight_zero_point)
-        return
-
-
-@QUANTMETHODS.register("awq_marlin")
-class AWQMARLINW4A16QuantizationMethod(AWQBaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.pack_factor = 8
-        self.nbits = 4
-        self.weight_scale_suffix = "scales"
-        self.weight_zero_point_suffix = "qzeros"
-        self.weight_suffix = "qweight"
-        self.g_idx = marlin_make_empty_g_idx(torch.device("cuda"))
-        self.g_idx_sort_indices = marlin_make_empty_g_idx(torch.device("cuda"))
-        self.workspace = marlin_make_workspace_new(torch.device("cuda"))
-        self.vllm_quant_type = TYPE_MAP[self.nbits]
-        self.has_weight_scale = True
-        self.has_weight_zero_point = True
-        self.tile_size = 16
-
-    @property
-    def method_name(self):
-        return "awq_marlin"
-
-    def quantize(self, weight: torch.Tensor, offset: int = 0) -> WeightPack:
-        raise NotImplementedError("AWQ online quantization is not supported yet.")
-
-    def params_repack(
-        self, weight: torch.Tensor, weight_scale: torch.Tensor, weight_zero_point: torch.Tensor, dtype_type: torch.dtype
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        一些量化方法在将参数完成量化后，为了加速性能，还需要将参数进行重拍，使算子性能达到最优，如awq方法。
-        """
-        weight = self._process_weight_after_loading(weight.cuda(get_current_device_id()))
-        weight_scale = self._process_weight_scale_after_loading(
-            weight_scale.cuda(get_current_device_id()).to(dtype_type)
-        )
-        weight_zero_point = self._process_weight_zero_point_after_loading(
-            weight_zero_point.cuda(get_current_device_id())
-        )
-        return weight, weight_scale, weight_zero_point
-
-    def apply(
+    def _apply_marlin(
         self,
         input_tensor: torch.Tensor,
         weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor],
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
@@ -214,6 +200,30 @@ def apply(
 
     def create_weight(
         self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        if not hasattr(self, "_checked_marlin"):
+            self._check_and_set_marlin()
+            self._checked_marlin = True
+
+        if self._use_marlin:
+            return self._create_weight_marlin(out_dim, in_dim, dtype, device_id, num_experts)
+        else:
+            return self._create_weight_basic(out_dim, in_dim, dtype, device_id, num_experts)
+
+    def _create_weight_basic(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        group_size = self.hf_quantization_config["group_size"]
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (in_dim, out_dim // self.pack_factor), dtype=torch.int32).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (in_dim // group_size, out_dim), dtype=dtype).cuda(device_id)
+        weight_zero_point = torch.empty(
+            expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
+        ).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+
+    def _create_weight_marlin(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
     ) -> WeightPack:
         self.n = out_dim
         self.k = in_dim
@@ -229,6 +239,20 @@ def create_weight(
         return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
 
     def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        if not hasattr(self, "_checked_marlin"):
+            self._check_and_set_marlin()
+            self._checked_marlin = True
+
+        if self._use_marlin:
+            self._load_weight_marlin(weight, weight_pack, start_idx)
+        else:
+            self._load_weight_basic(weight, weight_pack, start_idx)
+
+    def _load_weight_basic(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        start_idx = start_idx // self.pack_factor
+        weight_pack.weight[:, start_idx : start_idx + weight.shape[1]].copy_(weight)
+
+    def _load_weight_marlin(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
         assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
         device_id = get_current_device_id()
         repack_weight = vllm_ops.awq_marlin_repack(
@@ -239,9 +263,21 @@ def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx:
         )
         start_idx = start_idx // self.pack_factor * self.tile_size
         weight_pack.weight[:, start_idx : start_idx + repack_weight.shape[1]].copy_(repack_weight)
-        return
 
     def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        if not hasattr(self, "_checked_marlin"):
+            self._check_and_set_marlin()
+            self._checked_marlin = True
+
+        if self._use_marlin:
+            self._load_weight_scale_marlin(weight_scale, weight_pack, start_idx)
+        else:
+            self._load_weight_scale_basic(weight_scale, weight_pack, start_idx)
+
+    def _load_weight_scale_basic(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        weight_pack.weight_scale[:, start_idx : start_idx + weight_scale.shape[1]].copy_(weight_scale)
+
+    def _load_weight_scale_marlin(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
         assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
         group_size = self.hf_quantization_config["group_size"]
         device_id = get_current_device_id()
@@ -252,9 +288,27 @@ def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack,
             group_size=self.hf_quantization_config["group_size"],
         )
         weight_pack.weight_scale[:, start_idx : start_idx + repack_weight_scale.shape[1]].copy_(repack_weight_scale)
-        return
 
     def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        if not hasattr(self, "_checked_marlin"):
+            self._check_and_set_marlin()
+            self._checked_marlin = True
+
+        if self._use_marlin:
+            self._load_weight_zero_point_marlin(weight_zero_point, weight_pack, start_idx)
+        else:
+            self._load_weight_zero_point_basic(weight_zero_point, weight_pack, start_idx)
+
+    def _load_weight_zero_point_basic(
+        self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int
+    ) -> None:
+        start_idx = start_idx // self.pack_factor
+        end_idx = start_idx + weight_zero_point.shape[1]
+        weight_pack.weight_zero_point[:, start_idx:end_idx].copy_(weight_zero_point)
+
+    def _load_weight_zero_point_marlin(
+        self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int
+    ) -> None:
         device_id = get_current_device_id()
         repack_weight_zero_point = awq_to_marlin_zero_points(
             weight_zero_point.cuda(device_id),
@@ -266,29 +320,3 @@ def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: W
         weight_pack.weight_zero_point[:, start_idx : start_idx + repack_weight_zero_point.shape[1]].copy_(
             repack_weight_zero_point
         )
-        return
-
-
-# adapted from
-# https://github.com/vllm-project/vllm/blob/aef368aa08572505b820db01da82e2fbb3d43a72/vllm/model_executor/layers/quantization/awq_marlin.py#L211-L212
-def is_awq_marlin_compatible(quantization_config: dict[str, Any]):
-    # Extract data from quant config.
-    quant_method = quantization_config.get("quant_method", "").lower()
-    num_bits = quantization_config.get("bits")
-    group_size = quantization_config.get("group_size")
-    zero_point = quantization_config.get("zero_point")
-
-    if not torch.cuda.is_available():
-        return False
-
-    if quant_method != "awq":
-        return False
-
-    # If we cannot find the info needed in the config, cannot convert.
-    if num_bits is None or group_size is None or zero_point is None:
-        return False
-
-    if num_bits not in TYPE_MAP:
-        return False
-
-    return check_marlin_supported(quant_type=TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point)
diff --git a/lightllm/common/quantization/types/fp8_block128.py b/lightllm/common/quantization/types/fp8_block128.py
new file mode 100644
index 000000000..4144dddde
--- /dev/null
+++ b/lightllm/common/quantization/types/fp8_block128.py
@@ -0,0 +1,216 @@
+import torch
+from typing import Optional
+
+from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
+from lightllm.common.quantization.registry import QUANTMETHODS
+from lightllm.common.quantization.backend import QUANT_BACKEND, BackendType
+from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
+from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import deep_gemm
+
+    HAS_DEEPGEMM = True
+except ImportError:
+    HAS_DEEPGEMM = False
+
+try:
+    from lightllm.utils.vllm_utils import HAS_VLLM
+
+    if HAS_VLLM:
+        from lightllm.utils.vllm_utils import cutlass_scaled_mm
+    else:
+        cutlass_scaled_mm = None
+except ImportError:
+    HAS_VLLM = False
+    cutlass_scaled_mm = None
+
+
+def _deepgemm_fp8_nt(a_tuple, b_tuple, out):  # call whichever FP8 NT GEMM entry point deep_gemm exposes
+    if HAS_DEEPGEMM and hasattr(deep_gemm, "gemm_fp8_fp8_bf16_nt"):  # guard: name is unbound if import failed
+        return deep_gemm.gemm_fp8_fp8_bf16_nt([a_tuple[0], a_tuple[1]], [b_tuple[0], b_tuple[1]], out)
+    if HAS_DEEPGEMM and hasattr(deep_gemm, "fp8_gemm_nt"):
+        return deep_gemm.fp8_gemm_nt((a_tuple[0], a_tuple[1]), (b_tuple[0], b_tuple[1]), out)
+    raise RuntimeError("deep_gemm does not provide fp8 NT GEMM kernel in this version")
+
+
+@QUANTMETHODS.register(["fp8-block128"])
+class FP8Block128Quantization(QuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+
+        self.cache_manager = g_cache_manager
+        self.block_size = 128
+        self.weight_scale_suffix = "weight_scale_inv"
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False
+
+        self._backend = QUANT_BACKEND.get_backend("fp8-block128")
+        logger.info(f"FP8Block128Quantization using backend: {self._backend.name}")
+
+    @property
+    def method_name(self):
+        return "fp8-block128"
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        """Quantize ``weight`` with ``weight_quant`` and copy it and its scales into ``output`` at row ``offset``."""
+        from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
+
+        weight, scale = weight_quant(weight.cuda(output.weight.device), self.block_size)
+        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
+        scale_start = offset // self.block_size  # scale rows are indexed in blocks, weight rows are not
+        output.weight_scale[scale_start : scale_start + scale.shape[0]].copy_(scale)
+
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
+        m, k = input_tensor.shape
+
+        if self._backend == BackendType.DEEPGEMM:
+            return self._apply_deepgemm(input_tensor, weight_pack, out, alloc_func, bias)
+        elif self._backend == BackendType.VLLM:
+            return self._apply_vllm(input_tensor, weight_pack, out, alloc_func, bias)
+        else:
+            return self._apply_triton(input_tensor, weight_pack, out, alloc_func, bias)
+
+    def _apply_deepgemm(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        alloc_func,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        qweight = weight_pack.weight
+        weight_scale = weight_pack.weight_scale
+        m, k = input_tensor.shape
+        n = qweight.shape[0]
+
+        qinput_tensor, input_scale = per_token_group_quant_fp8(
+            input_tensor,
+            self.block_size,
+            dtype=qweight.dtype,
+            column_major_scales=True,
+            scale_tma_aligned=True,
+            alloc_func=alloc_func,
+        )
+
+        if out is None:
+            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        _deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight, weight_scale), out)
+
+        if bias is not None:
+            out.add_(bias)
+        return out
+
+    def _apply_vllm(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        alloc_func,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:  # vLLM backend: cutlass_scaled_mm when n % 128 == 0, otherwise the Triton block GEMM
+        qweight = weight_pack.weight.t()  # transposed views of the packed weight and its scales
+        weight_scale = weight_pack.weight_scale.t()
+        m, k = input_tensor.shape
+        n = qweight.shape[1]
+
+        qinput_tensor, input_scale = per_token_group_quant_fp8(
+            input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
+        )
+
+        if out is None:
+            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        if n % 128 != 0:  # this path uses the Triton kernel instead of cutlass_scaled_mm
+            w8a8_block_fp8_matmul(
+                qinput_tensor,
+                qweight,
+                input_scale,
+                weight_scale,
+                out,
+                (self.block_size, self.block_size),
+                dtype=input_tensor.dtype,
+            )
+        else:
+            input_scale = input_scale.t().contiguous().t()  # same shape, transposed memory layout
+            cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
+            return out  # bias was passed to cutlass_scaled_mm, so skip the add_ below
+
+        if bias is not None:  # only reached on the Triton path
+            out.add_(bias)
+        return out
+
+    def _apply_triton(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        alloc_func,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """FP8 block GEMM of ``input_tensor`` against the packed weight using the Triton kernels."""
+        # Stored weight is (out_dim, in_dim); pass (in_dim, out_dim) views, matching _apply_vllm's Triton call.
+        qweight = weight_pack.weight.t()
+        weight_scale = weight_pack.weight_scale.t()
+        m, k = input_tensor.shape
+        n = qweight.shape[1]
+        qinput_tensor, input_scale = per_token_group_quant_fp8(
+            input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
+        )
+        if out is None:
+            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        w8a8_block_fp8_matmul(
+            qinput_tensor,
+            qweight,
+            input_scale,
+            weight_scale,
+            out,
+            (self.block_size, self.block_size),
+            dtype=input_tensor.dtype,
+        )
+
+        if bias is not None:
+            out.add_(bias)
+        return out
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        weight_scale = torch.empty(
+            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
+        ).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
+        return
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy ``weight_scale`` into ``weight_pack.weight_scale``; ``start_idx`` is a weight-row offset."""
+        # One scale row per block_size weight rows: convert the offset to blocks, then span the incoming rows.
+        scale_start = start_idx // self.block_size
+        weight_pack.weight_scale[scale_start : scale_start + weight_scale.shape[0]].copy_(weight_scale)
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy ``weight_zero_point`` into the pack when it has a zero-point buffer; otherwise do nothing."""
+        if weight_pack.weight_zero_point is not None:
+            # Same block-unit conversion as the scales: start at the block index, span the incoming rows.
+            zp_start = start_idx // self.block_size
+            weight_pack.weight_zero_point[zp_start : zp_start + weight_zero_point.shape[0]].copy_(weight_zero_point)
diff --git a/lightllm/common/quantization/types/fp8_per_token.py b/lightllm/common/quantization/types/fp8_per_token.py
new file mode 100644
index 000000000..c49bc89ff
--- /dev/null
+++ b/lightllm/common/quantization/types/fp8_per_token.py
@@ -0,0 +1,172 @@
+import torch
+from typing import Optional
+
+from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
+from lightllm.common.quantization.registry import QUANTMETHODS
+from lightllm.common.quantization.backend import QUANT_BACKEND, BackendType
+from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    from lightllm.utils.vllm_utils import HAS_VLLM
+
+    if HAS_VLLM:
+        from lightllm.utils.vllm_utils import vllm_ops, cutlass_scaled_mm
+    else:
+        vllm_ops = None
+        cutlass_scaled_mm = None
+except ImportError:
+    HAS_VLLM = False
+    vllm_ops = None
+    cutlass_scaled_mm = None
+
+try:
+    from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
+
+    if HAS_LIGHTLLM_KERNEL:
+
+        def scaled_fp8_quant(tensor, *args, **kwargs):
+            return light_ops.per_token_quant_bf16_fp8(tensor)
+
+    else:
+        if HAS_VLLM:
+            scaled_fp8_quant = vllm_ops.scaled_fp8_quant
+        else:
+            scaled_fp8_quant = None
+except ImportError:
+    HAS_LIGHTLLM_KERNEL = False
+    if HAS_VLLM:
+        scaled_fp8_quant = vllm_ops.scaled_fp8_quant
+    else:
+        scaled_fp8_quant = None
+
+
+@QUANTMETHODS.register(["fp8-per-token", "fp8w8a8"])
+class FP8PerTokenQuantization(QuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+
+        self.cache_manager = g_cache_manager  # used by the apply paths to allocate `out`
+        self.is_moe = False  # when True, quantize() dispatches to _quantize_moe()
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False
+        self._backend = QUANT_BACKEND.get_backend("fp8-per-token")  # compared against BackendType in apply()
+        logger.info(f"FP8PerTokenQuantization using backend: {self._backend.name}")
+
+    @property
+    def method_name(self) -> str:
+        return "fp8-per-token"  # same string as the first name given to QUANTMETHODS.register
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        """Quantize `weight` to FP8 and copy the data and scales into `output` (row `offset` onward if not MoE)."""
+        if self.is_moe:
+            return self._quantize_moe(weight, output, offset)
+
+        if scaled_fp8_quant is None:  # neither lightllm-kernel nor vLLM supplied a kernel at import time
+            raise RuntimeError("No FP8 quantization kernel available. Install vLLM or lightllm-kernel.")
+
+        qweight, weight_scale = scaled_fp8_quant(
+            weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
+        )
+        output.weight[offset : offset + qweight.shape[0], :].copy_(qweight)
+        output.weight_scale[offset : offset + weight_scale.shape[0]].copy_(weight_scale.view(-1))  # flatten scales
+        return
+
+    def _quantize_moe(self, weight: torch.Tensor, output: WeightPack, offset: int) -> None:
+        if scaled_fp8_quant is None:  # neither lightllm-kernel nor vLLM supplied a kernel at import time
+            raise RuntimeError("No FP8 quantization kernel available. Install vLLM or lightllm-kernel.")
+
+        num_experts = weight.shape[0]  # dim 0 of `weight` indexes experts
+        qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
+        weight_scales = []
+        for i in range(num_experts):  # each expert slice is quantized independently
+            qweight, weight_scale = scaled_fp8_quant(
+                weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
+            )
+            qweights[i] = qweight
+            weight_scales.append(weight_scale)
+        weight_scale = torch.stack(weight_scales, dim=0).contiguous()  # stacked along a new expert dim
+        output.weight.copy_(qweights)  # whole-tensor copy: `offset` is not used on this path
+        output.weight_scale.copy_(weight_scale)
+        return
+
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,  # accepted but not forwarded to either backend
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if self._backend == BackendType.TRITON:  # every other backend value takes the vLLM path
+            return self._apply_triton(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
+        else:
+            return self._apply_vllm(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
+
+    def _apply_vllm(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        use_custom_tensor_mananger: bool,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        qweight = weight_pack.weight.t()  # transposed view of the stored weight
+        weight_scale = weight_pack.weight_scale
+        if cutlass_scaled_mm is None:  # the import fallback leaves this as None when vLLM is absent
+            raise RuntimeError("vLLM is required for the FP8 per-token cutlass path but is not installed.")
+        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+        m = input_tensor.shape[0]
+        n = qweight.shape[1]  # equals weight_pack.weight.shape[0]
+
+        if out is None:  # allocate the (m, n) result only when the caller did not pass one
+            if use_custom_tensor_mananger:
+                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
+            else:
+                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)  # bias is handed to the kernel
+        return out
+
+    def _apply_triton(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        use_custom_tensor_mananger: bool,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        qweight = weight_pack.weight.t()  # transposed view of the stored weight
+        weight_scale = weight_pack.weight_scale
+
+        if scaled_fp8_quant is None:  # activations are still quantized by the vLLM/lightllm-kernel op
+            raise RuntimeError("No FP8 quantization kernel available. Install vLLM or lightllm-kernel.")
+
+        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+        m = input_tensor.shape[0]
+        n = qweight.shape[1]  # equals weight_pack.weight.shape[0]
+
+        if out is None:  # allocate the (m, n) result only when the caller did not pass one
+            if use_custom_tensor_mananger:
+                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
+            else:
+                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        out = fp8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
+
+        if bias is not None:  # on this path the bias is added in place after the matmul
+            out.add_(bias)
+        return out
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:  # `dtype` is accepted but the buffers below use fixed dtypes
+        expert_prefix = (num_experts,) if num_experts > 1 else ()  # expert dim only if num_experts > 1
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)  # 1 per row
+        return WeightPack(weight=weight, weight_scale=weight_scale)
diff --git a/lightllm/common/quantization/no_quant.py b/lightllm/common/quantization/types/no_quant.py
similarity index 90%
rename from lightllm/common/quantization/no_quant.py
rename to lightllm/common/quantization/types/no_quant.py
index 987601c5d..e92d821c1 100644
--- a/lightllm/common/quantization/no_quant.py
+++ b/lightllm/common/quantization/types/no_quant.py
@@ -1,11 +1,14 @@
-from .quantize_method import QuantizationMethod, WeightPack
-from .registry import QUANTMETHODS
 import torch
 from typing import Optional
 
+from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
+from lightllm.common.quantization.registry import QUANTMETHODS
+
 
 @QUANTMETHODS.register("none")
 class NoQuantization(QuantizationMethod):
+    """No quantization - uses full precision weights."""
+
     def apply(
         self,
         input_tensor: torch.Tensor,
diff --git a/lightllm/common/quantization/types/w8a8.py b/lightllm/common/quantization/types/w8a8.py
new file mode 100644
index 000000000..e3b0ef592
--- /dev/null
+++ b/lightllm/common/quantization/types/w8a8.py
@@ -0,0 +1,108 @@
+import torch
+from typing import Optional
+
+from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
+from lightllm.common.quantization.registry import QUANTMETHODS
+from lightllm.common.quantization.backend import QUANT_BACKEND, BackendType
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+# Conditional imports for optional backends
+try:
+    from lightllm.utils.vllm_utils import HAS_VLLM
+
+    if HAS_VLLM:
+        from lightllm.utils.vllm_utils import vllm_ops, cutlass_scaled_mm
+    else:
+        vllm_ops = None
+        cutlass_scaled_mm = None
+except ImportError:
+    HAS_VLLM = False
+    vllm_ops = None
+    cutlass_scaled_mm = None
+
+
+@QUANTMETHODS.register(["w8a8", "vllm-w8a8"])
+class W8A8Quantization(QuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+
+        self.cache_manager = g_cache_manager
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False
+
+        self._backend = QUANT_BACKEND.get_backend("w8a8")
+
+        if self._backend == BackendType.TRITON:
+            if not HAS_VLLM:
+                raise NotImplementedError(
+                    "W8A8 Triton fallback is not yet implemented. "
+                    "Please install vLLM or disable LIGHTLLM_USE_TRITON_QUANT."
+                )
+            self._backend = BackendType.VLLM
+            logger.warning("W8A8 Triton fallback not implemented, falling back to vLLM backend")
+
+        if self._backend == BackendType.VLLM and not HAS_VLLM:
+            raise RuntimeError("vLLM is required for W8A8 quantization but is not installed.")
+
+        logger.info(f"W8A8Quantization using backend: {self._backend.name}")
+
+    @property
+    def method_name(self):
+        return "w8a8"
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        weight = weight.float().cuda(self.device_id_)
+        scale = weight.abs().max(dim=-1)[0] / 127
+        weight = weight / scale.reshape(-1, 1)
+        weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
+        output.weight[offset : offset + weight.shape[0]].copy_(weight)
+        output.weight_scale[offset : offset + weight.shape[0]].copy_(scale)
+        return
+
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # TODO: Currently only vLLM backend is implemented
+        return self._apply_vllm(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
+
+    def _apply_vllm(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        use_custom_tensor_mananger: bool,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        qweight = weight_pack.weight.t()
+        weight_scale = weight_pack.weight_scale
+
+        x_q, x_scale, x_zp = vllm_ops.scaled_int8_quant(input_tensor, scale=None, azp=None, symmetric=True)
+
+        m = input_tensor.shape[0]
+        n = qweight.shape[1]
+
+        if out is None:
+            if use_custom_tensor_mananger:
+                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
+            else:
+                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
+        return out
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
diff --git a/lightllm/common/quantization/w8a8_quant.py b/lightllm/common/quantization/w8a8_quant.py
deleted file mode 100644
index 1728e799d..000000000
--- a/lightllm/common/quantization/w8a8_quant.py
+++ /dev/null
@@ -1,253 +0,0 @@
-import os
-import torch
-
-from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
-from .quantize_method import QuantizationMethod
-from .registry import QUANTMETHODS
-import torch.nn.functional as F
-from typing import Optional, TYPE_CHECKING
-from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
-from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
-from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
-from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
-
-
-from .quantize_method import WeightPack
-
-if HAS_LIGHTLLM_KERNEL:
-
-    def scaled_fp8_quant(tensor, *args, **kwargs):
-        return light_ops.per_token_quant_bf16_fp8(tensor)
-
-else:
-    if HAS_VLLM:
-        scaled_fp8_quant = vllm_ops.scaled_fp8_quant
-
-LIGHTLLM_USE_TRITON_FP8_SCALED_MM = os.getenv("LIGHTLLM_USE_TRITON_FP8_SCALED_MM", "False").upper() in [
-    "ON",
-    "TRUE",
-    "1",
-]
-
-
-class BaseQuantizationMethod(QuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        assert HAS_VLLM, "vllm are not installed, you can't use quant api of them."
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-
-        self.cache_manager = g_cache_manager
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        raise NotImplementedError("Not implemented")
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        raise NotImplementedError("Not implemented")
-
-    @property
-    def method_name(self):
-        return "w8a8-base"
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        raise NotImplementedError("Not implemented")
-
-
-@QUANTMETHODS.register(["vllm-w8a8", "w8a8"])
-class w8a8QuantizationMethod(BaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        weight = weight.float().cuda(self.device_id_)
-        scale = weight.abs().max(dim=-1)[0] / 127
-        weight = weight / scale.reshape(-1, 1)
-        weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
-        output.weight[offset : offset + weight.shape[0]].copy_(weight)
-        output.weight_scale[offset : offset + weight.shape[0]].copy_(scale)
-        return
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        input_scale = None
-        qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale
-        input_scale = None  # dynamic quantization for input tensor
-        x_q, x_scale, x_zp = vllm_ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
-        m = input_tensor.shape[0]
-        n = qweight.shape[1]
-        if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
-        return out
-
-    @property
-    def method_name(self):
-        return "vllm-w8a8"
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
-        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
-
-
-@QUANTMETHODS.register(["vllm-fp8w8a8", "fp8w8a8"])
-class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.is_moe = False
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        if self.is_moe:
-            return self.quantize_moe(weight, output, offset)
-        qweight, weight_scale = scaled_fp8_quant(
-            weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
-        )
-        output.weight[offset : offset + qweight.shape[0], :].copy_(qweight)
-        output.weight_scale[offset : offset + weight_scale.shape[0]].copy_(weight_scale.view(-1))
-        return
-
-    def quantize_moe(self, weight: torch.Tensor) -> WeightPack:
-        num_experts = weight.shape[0]
-        qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
-        weight_scales = []
-        for i in range(num_experts):
-            qweight, weight_scale = scaled_fp8_quant(
-                weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
-            )
-            qweights[i] = qweight
-            weight_scales.append(weight_scale)
-        weight_scale = torch.stack(weight_scales, dim=0).contiguous()
-        return WeightPack(weight=qweights, weight_scale=weight_scale)
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale
-        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
-        m = input_tensor.shape[0]
-        n = qweight.shape[1]
-        if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-        if LIGHTLLM_USE_TRITON_FP8_SCALED_MM:
-            out = fp8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
-        else:
-            cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
-        return out
-
-    @property
-    def method_name(self):
-        return "vllm-fp8w8a8"
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
-
-
-@QUANTMETHODS.register(["vllm-fp8w8a8-b128", "fp8w8a8-b128"])
-class FP8w8a8B128QuantizationMethod(BaseQuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        self.block_size = 128
-        self.weight_scale_suffix = "weight_scale_inv"
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
-
-        device = output.weight.device
-        weight, scale = weight_quant(weight.cuda(device), self.block_size)
-        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
-        output.weight_scale[offset // self.block_size : offset + weight.shape[0] // self.block_size].copy_(scale)
-        return
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale.t()
-        input_scale = None  # dynamic quantization for input tensor
-        m, k = input_tensor.shape
-        n = qweight.shape[1]
-        alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
-        if input_scale is None:
-            qinput_tensor, input_scale = per_token_group_quant_fp8(
-                input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
-            )
-        if out is None:
-            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-        if n % 128 != 0:
-            w8a8_block_fp8_matmul(
-                qinput_tensor,
-                qweight,
-                input_scale,
-                weight_scale,
-                out,
-                (self.block_size, self.block_size),
-                dtype=input_tensor.dtype,
-            )
-        else:
-            input_scale = input_scale.t().contiguous().t()
-            cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
-        return out
-
-    @property
-    def method_name(self):
-        return "vllm-fp8w8a8-b128"
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        weight_scale = torch.empty(
-            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
-        ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)

From 96a15fa9b10347eebfd697ac08a401de81a67d54 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 15 Jan 2026 05:49:32 +0000
Subject: [PATCH 17/65] refactor quantization (draft)

---
 .../layer_weights/meta_weights/norm_weight.py |  20 +-
 .../layer_weights/meta_weights/platform_op.py |  31 +-
 .../triton_kernel/dequantize_gemm_int4.py     | 649 ------------------
 .../triton_kernel/dequantize_gemm_int8.py     | 209 ------
 .../triton_kernel/{ => norm}/layernorm.py     |   0
 .../triton_kernel/{ => norm}/qk_norm.py       |   0
 .../triton_kernel/{ => norm}/rmsnorm.py       |   0
 .../triton_kernel/quantization}/__init__.py   |   0
 .../{ => quantization}/bmm_scaled_fp8.py      |   0
 .../quantization}/fp8act_quant_kernel.py      |   0
 .../fp8w8a8_block_gemm_kernel.py              |   0
 .../fp8w8a8_block_quant_kernel.py             |   0
 .../fp8w8a8_scaled_mm_per_token_kernel.py     |   0
 .../q_per_head_fp8_quant.py                   |   0
 .../triton_kernel/quantize_gemm_int8.py       | 376 ----------
 lightllm/common/quantization/__init__.py      | 105 +--
 .../common/quantization/{types => }/awq.py    |   0
 lightllm/common/quantization/backend.py       |  82 ---
 .../quantization/{types => }/fp8_block128.py  |   0
 .../quantization/{types => }/fp8_per_token.py |   0
 .../quantization/{types => }/no_quant.py      |   0
 .../quantization/triton_quant/fp8/__init__.py |   0
 .../common/quantization/types/__init__.py     |  13 -
 .../common/quantization/{types => }/w8a8.py   |  41 +-
 .../layer_weights/transformer_layer_weight.py |   1 -
 lightllm/server/api_cli.py                    |  12 +-
 26 files changed, 76 insertions(+), 1463 deletions(-)
 delete mode 100644 lightllm/common/basemodel/triton_kernel/dequantize_gemm_int4.py
 delete mode 100644 lightllm/common/basemodel/triton_kernel/dequantize_gemm_int8.py
 rename lightllm/common/basemodel/triton_kernel/{ => norm}/layernorm.py (100%)
 rename lightllm/common/basemodel/triton_kernel/{ => norm}/qk_norm.py (100%)
 rename lightllm/common/basemodel/triton_kernel/{ => norm}/rmsnorm.py (100%)
 rename lightllm/common/{quantization/triton_quant => basemodel/triton_kernel/quantization}/__init__.py (100%)
 rename lightllm/common/basemodel/triton_kernel/{ => quantization}/bmm_scaled_fp8.py (100%)
 rename lightllm/common/{quantization/triton_quant/fp8 => basemodel/triton_kernel/quantization}/fp8act_quant_kernel.py (100%)
 rename lightllm/common/{quantization/triton_quant/fp8 => basemodel/triton_kernel/quantization}/fp8w8a8_block_gemm_kernel.py (100%)
 rename lightllm/common/{quantization/triton_quant/fp8 => basemodel/triton_kernel/quantization}/fp8w8a8_block_quant_kernel.py (100%)
 rename lightllm/common/{quantization/triton_quant/fp8 => basemodel/triton_kernel/quantization}/fp8w8a8_scaled_mm_per_token_kernel.py (100%)
 rename lightllm/common/basemodel/triton_kernel/{ => quantization}/q_per_head_fp8_quant.py (100%)
 delete mode 100644 lightllm/common/basemodel/triton_kernel/quantize_gemm_int8.py
 rename lightllm/common/quantization/{types => }/awq.py (100%)
 delete mode 100644 lightllm/common/quantization/backend.py
 rename lightllm/common/quantization/{types => }/fp8_block128.py (100%)
 rename lightllm/common/quantization/{types => }/fp8_per_token.py (100%)
 rename lightllm/common/quantization/{types => }/no_quant.py (100%)
 delete mode 100644 lightllm/common/quantization/triton_quant/fp8/__init__.py
 delete mode 100644 lightllm/common/quantization/types/__init__.py
 rename lightllm/common/quantization/{types => }/w8a8.py (75%)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 73b937b77..df12ec9b1 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -34,12 +34,12 @@ def _native_forward(
         variance = x_var.pow(2).mean(dim=-1, keepdim=True)
         x = x * torch.rsqrt(variance + eps)
         x = (x * self.weight).to(self.data_type_)
-        if out is None:
+        if out is not None:
             out.copy_(x)
             return out
         return x
 
-    def _cuda_forward(
+    def _triton_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
         assert input.ndim == 2 and self.weight.ndim == 1
@@ -47,6 +47,12 @@ def _cuda_forward(
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
         return rmsnorm_forward(x=input, weight=self.weight, eps=eps, out=out)
 
+    def _cuda_forward(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        # On the CUDA platform rmsnorm only has a Triton implementation, so delegate to it.
+        return self._triton_forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
+
     def __call__(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
@@ -80,12 +86,12 @@ def _native_forward(
         x = torch.nn.functional.layer_norm(
             input, normalized_shape=[self.dim], weight=self.weight, bias=self.bias, eps=eps
         )
-        if out is None:
+        if out is not None:
             out.copy_(x.to(self.data_type_))
             return out
         return x.to(self.data_type_)
 
-    def _cuda_forward(
+    def _triton_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
         assert input.ndim == 2 and self.weight.ndim == 1
@@ -93,6 +99,12 @@ def _cuda_forward(
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
         return layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps, out=out)
 
+    def _cuda_forward(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        # On the CUDA platform layernorm only has a Triton implementation, so delegate to it.
+        return self._triton_forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
+
     def __call__(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py b/lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py
index 127a543b2..1ba1610fc 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/platform_op.py
@@ -3,6 +3,9 @@
 from typing import Optional, Callable, Any
 from lightllm.utils.device_utils import get_platform, Platform
 from lightllm.utils.envs_utils import get_env_start_args
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
 
 
 class PlatformAwareOp(ABC):
@@ -14,10 +17,12 @@ class PlatformAwareOp(ABC):
     def __init__(self):
         args = get_env_start_args()
         self.platform = get_platform(args.hardware_platform)
-        self.enable_torch_naive = args.enable_torch_naive
+        self.enable_torch_fallback = args.enable_torch_fallback
+        self.enable_triton_fallback = args.enable_triton_fallback
         self._forward = self._route_forward()
 
     def _route_forward(self) -> Callable:
+
         method_name_map = {
             Platform.CUDA: "_cuda_forward",
             Platform.ASCEND: "_ascend_forward",
@@ -33,14 +38,23 @@ def _route_forward(self) -> Callable:
             if callable(method):
                 return method
 
-        if self.enable_torch_naive:
+        if self.enable_triton_fallback:  # the base class has a stub, so test for a real override, not hasattr
+            if type(self)._triton_forward is not PlatformAwareOp._triton_forward:
+                return self._triton_forward
+            logger.warning(
+                f"No triton implementation found for {self.__class__.__name__} on {self.platform.name} platform. "
+                f"Please implement {self.__class__.__name__}_{self.platform.name}_triton_forward method, "
+                f"or set --enable_torch_fallback to use default implementation."
+            )
+
+        if self.enable_torch_fallback:
             return self._native_forward
 
-        # 如果都没有，抛出异常
+        # if no implementation found, raise error
         raise NotImplementedError(
-            f"No implementation found for platform {self.platform.name}. "
-            f"Please implement _{self.platform.name}_forward method, "
-            f"or set --enable_torch_naive to use default implementation."
+            f"No implementation found for {self.__class__.__name__} on {self.platform.name} platform. "
+            f"Please implement {self.__class__.__name__}_{self.platform.name}_forward method, "
+            f"or set --enable_torch_fallback to use default implementation."
         )
 
     @abstractmethod
@@ -50,3 +64,8 @@ def _native_forward(self, *args, **kwargs) -> Any:
     @abstractmethod
     def _cuda_forward(self, *args, **kwargs) -> Any:
         raise NotImplementedError("cuda forward must implement this method")
+
+    # Triton may become compatible with all hardware platforms in the future,
+    # so a Triton implementation is offered as a fallback on every platform.
+    def _triton_forward(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("triton forward must implement this method")
diff --git a/lightllm/common/basemodel/triton_kernel/dequantize_gemm_int4.py b/lightllm/common/basemodel/triton_kernel/dequantize_gemm_int4.py
deleted file mode 100644
index 143d93b23..000000000
--- a/lightllm/common/basemodel/triton_kernel/dequantize_gemm_int4.py
+++ /dev/null
@@ -1,649 +0,0 @@
-import time
-
-import torch
-
-import triton
-import triton.language as tl
-
-
-@triton.autotune(
-	configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8), 
-    ],
-	key=['M', 'N', 'K', 'NO_GROUPS'],
-)
-@triton.jit
-def matmul4_kernel(
-	a_ptr, b_ptr, c_ptr,
-	scales_ptr, zeros_ptr,
-	M, N, K,
-	stride_am, stride_ak,
-	stride_bk, stride_bn,
-	stride_cm, stride_cn,
-	stride_scales_g, stride_scales_n,
-	stride_zeros_g, stride_zeros_n,
-	groupsize, NO_GROUPS: tl.constexpr,
-	BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-	GROUP_SIZE_M: tl.constexpr,
-):
-    """
-    Compute the matrix multiplication C = A x B.
-    A is of shape (M, K) float16
-    B is of shape (K//8, N) int32
-    C is of shape (M, N) float16
-    scales is of shape (G, N) float16
-    zeros is of shape (G, N//8) int32
-    groupsize is an int specifying the size of groups for scales and zeros.
-    G is K // groupsize.
-    Set NO_GROUPS to groupsize == K, in which case G = 1 and the kernel is more efficient.
-    WARNING: This kernel assumes that K is a multiple of BLOCK_SIZE_K.
-    WARNING: This kernel assumes that N is a multiple of BLOCK_SIZE_N.
-    WARNING: This kernel assumes that groupsize is a multiple of BLOCK_SIZE_K.
-    """
-    bits = 4
-    infearure_per_bits = 8
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m    
-    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)   # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-    a_mask = (offs_am[:, None] < M)
-    # b_ptrs is set up such that it repeats elements along the K axis 8 times
-    b_ptrs = b_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)   # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-    scales_ptrs = scales_ptr + offs_bn * stride_scales_n   # (BLOCK_SIZE_N,)
-    # zeros_ptrs is set up such that it repeats elements along the N axis 8 times
-    zeros_ptrs = zeros_ptr + ((offs_bn // infearure_per_bits) * stride_zeros_n)   # (BLOCK_SIZE_N,)
-    # shifter is used to extract the 4 bits of each element in the 32-bit word from B and zeros
-    shifter = (offs_k % infearure_per_bits) * bits
-    zeros_shifter = (offs_bn % infearure_per_bits) * bits
-    # If G == 1, scales and zeros are the same for all K, so we can load them once
-    if NO_GROUPS:
-        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-        scales = tl.load(scales_ptrs)  # (BLOCK_SIZE_N,)
-        zeros = tl.load(zeros_ptrs)  # (BLOCK_SIZE_N,), each element is repeated 8 times, int32	
-        # Unpack zeros
-        zeros = (zeros >> zeros_shifter) & 0xF  # (BLOCK_SIZE_N,) int32
-        # zeros = (zeros + 1) * scales  # (BLOCK_SIZE_N,) float16
-        zeros = zeros * scales
-    # Now calculate a block of output of shape (BLOCK_SIZE_M, BLOCK_SIZE_N)
-    # M is along the batch dimension, N is along the outfeatures dimension, K is along the infeatures dimension
-    # So this loop is along the infeatures dimension (K)
-    # It's calculating BLOCK_SIZE_M batches in parallel, and for each batch, BLOCK_SIZE_N outfeatures in parallel
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, num_pid_k):
-        a = tl.load(a_ptrs, mask=a_mask, other=0.)   # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-        if not NO_GROUPS:
-            g_id = k // (groupsize // BLOCK_SIZE_K)
-            ptr = scales_ptrs + g_id * stride_scales_g
-            scales = tl.load(ptr)  # (BLOCK_SIZE_N,)
-            ptr = zeros_ptrs + g_id * stride_zeros_g   # (BLOCK_SIZE_N,)
-            zeros = tl.load(ptr)  # (BLOCK_SIZE_N,), each element is repeated 8 times, int32	
-            # Unpack zeros
-            zeros = (zeros >> zeros_shifter) & 0xF  # (BLOCK_SIZE_N,) int32
-            zeros = (zeros) * scales  # (BLOCK_SIZE_N,) float16	
-        # Now we need to unpack b (which is 4-bit values) into 32-bit values
-        b = (b >> shifter[:, None]) & 0xF  # Extract the 4-bit values
-        b = b * scales[None, :] - zeros[None, :]  # Scale and shift
-        # print("data type", a, b)
-        accumulator += tl.dot(a, b.to(a.dtype))
-        a_ptrs += BLOCK_SIZE_K * stride_ak
-        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk  
-    c = accumulator.to(c_ptr.dtype.element_ty)  
-    # Store the result
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-    tl.store(c_ptrs, accumulator, mask=c_mask)
-
-
-def matmul_dequantize_int4_gptq(x: torch.FloatTensor, qweight: torch.IntTensor, scales: torch.FloatTensor, qzeros: torch.IntTensor, group_size, output=None) -> torch.FloatTensor:
-	"""
-	Compute the matrix multiplication C = A x B + bias.
-	Where B is quantized using GPTQ and groupsize = -1 into 4-bit values.
-
-	A is of shape (..., K) float16
-	qweight is of shape (K//8, N) int32
-	scales is of shape (G, N) float16
-	qzeros is of shape (G, N//8) int32
-	bias is of shape (1, N) float16
-
-	groupsize is the number of infeatures in each group.
-	G = K // groupsize
-
-	Returns C of shape (..., N) float16
-	"""
-	assert x.shape[-1] == (qweight.shape[0] * 8), "A must be a multiple of 8 in the last dimension"
-	assert x.is_contiguous(), "A must be contiguous"
-
-	M, K = x.shape
-	N = qweight.shape[1]
-	# This is based on the possible BLOCK_SIZE_Ks
-	# assert K % 16 == 0 and K % 32 == 0 and K % 64 == 0 and K % 128 == 0, "K must be a multiple of 16, 32, 64, and 128"
-	# # This is based on the possible BLOCK_SIZE_Ns
-	# assert N % 16 == 0 and N % 32 == 0 and N % 64 == 0 and N % 128 == 0 and N % 256 == 0, "N must be a multiple of 16, 32, 64, 128, and 256"
-	# # This is based on the possible BLOCK_SIZE_Ks
-	# assert groupsize % 32 == 0 and groupsize % 64 == 0 and groupsize % 128 == 0, "groupsize must be a multiple of 32, 64, and 128"
-
-	# output = torch.empty((M, N), device='cuda', dtype=torch.float16)
-	if output is None:
-		inplace = False
-		output = torch.empty((M, N), device=x.device, dtype=x.dtype)
-	else:
-		inplace = True
-
-	grid = lambda META: (
-		triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
-	)
-	matmul4_kernel[grid](
-		x, qweight, output,
-		scales, qzeros,
-		M, N, K,
-		x.stride(0), x.stride(1),
-		qweight.stride(0), qweight.stride(1),
-		output.stride(0), output.stride(1),
-		scales.stride(0), scales.stride(1),
-		qzeros.stride(0), qzeros.stride(1),
-		group_size, group_size == K,
-    )
-	# return output
-	if not inplace:
-		return output
-
-
-@triton.autotune(
-	configs=[
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 512, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 512, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-	    
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 512, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 512, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-		
- ],
-	key=['M', 'N', 'K'],
-    reset_to_zero=['c_ptr']
-)
-@triton.jit
-def matmul_kernel(
-    a_ptr, b_ptr, c_ptr,
-    bs_ptr, bzp_ptr,
-    M, N, K,
-    stride_am, stride_ak,
-    stride_bk, stride_bn,
-    stride_cm, stride_cn,
-    stride_bsk, stride_bsn,
-    stride_bzpk, stride_bzpn,
-    group_size,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, SPLIT_K: tl.constexpr
-    ):
-    """
-    assert K % (BLOCK_SIZE_K * SPLIT_K) == 0
-    """
-    pid = tl.program_id(axis=0)
-    pid_sp_k = tl.program_id(axis=1)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m    
-    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
-    offs_k = pid_sp_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
-
-    # [BLOCK_M, BLOCK_K]
-    a_ptrs = a_ptr + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
-    # [BLOCK_K, BLOCK_N] but repeated 8 times in N
-    b_ptrs = b_ptr + (offs_k[:, None] // 8) * stride_bk + offs_bn[None, :] * stride_bn
-    # tl.static_print("shape", a_ptrs, b_ptrs, bs_ptrs, bzp_ptrs)
-    # -----------------------------------------------------------
-    # Iterate to compute a block of the C matrix.
-    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
-    # of fp32 values for higher accuracy.
-    # `accumulator` will be converted back to fp16 after the loop.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
-        # Load the next block of A and B.
-        # [BLOCK_K, BLOCK_N] but repeated group_size times in K 
-        bs_ptrs = bs_ptr + ((offs_k[:, None] + k * BLOCK_SIZE_K * SPLIT_K) // group_size) * stride_bsk \
-            + offs_bn[None, :] * stride_bsn
-        # [BLOCK_K, BLOCK_N] but repeated in K and N
-        bzp_ptrs = bzp_ptr + ((offs_k[:, None] + k * BLOCK_SIZE_K * SPLIT_K) // group_size) * stride_bzpk \
-            + (offs_bn[None, :] // 8) * stride_bzpn
-        b_shift_bits = (offs_k[:, None] % 8) * 4 # assert BLOCK_SIZE_K % 8 == 0
-        bzp_shift_bits = (offs_bn[None, :] % 8) * 4
-        a = tl.load(a_ptrs)
-        b = tl.load(b_ptrs)
-        bs = tl.load(bs_ptrs)
-        bzp = tl.load(bzp_ptrs)
-        # We accumulate along the K dimension.
-        int_b = (b >> b_shift_bits) & 0xF
-        int_bzp = (bzp >> bzp_shift_bits) & 0xF
-        b = ((int_b - int_bzp) * bs).to(a.dtype)
-        accumulator += tl.dot(a, b.to(a.dtype))
-        # Advance the ptrs to the next K block.
-        a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
-        b_ptrs += (BLOCK_SIZE_K * SPLIT_K * stride_bk // 8)  # assert BLOCK_SIZE_K % 8 == 0
-    # You can fuse arbitrary activation functions here
-    # while the accumulator is still in FP32!
-    c = accumulator.to(c_ptr.dtype.element_ty)
-    # -----------------------------------------------------------
-    # Write back the block of the output matrix C with masks.
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-    if SPLIT_K == 1:
-        tl.store(c_ptrs, c, mask=c_mask)
-    else:
-        tl.atomic_add(c_ptrs, c, mask=c_mask)
-
-
-def matmul_dequantize_int4_s2(x: torch.FloatTensor, qweight: torch.IntTensor, scales: torch.FloatTensor, qzeros: torch.IntTensor, group_size: int = 128, output=None) -> torch.FloatTensor:
-    """
-    """
-    assert x.is_contiguous(), "A must be contiguous"
-    assert qweight.is_contiguous(), "B must be contiguous"  
-    M, K = x.shape
-    N = scales.shape[1]
-    if output is None:
-        output = torch.zeros((M, N), device=x.device, dtype=x.dtype)  
-    grid = lambda META: (
-        triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
-        META['SPLIT_K'],
-    )
-    matmul_kernel[grid](
-        x, qweight, output,
-        scales, qzeros,
-        M, N, K,
-        x.stride(0), x.stride(1),
-        qweight.stride(0), qweight.stride(1),
-        output.stride(0), output.stride(1),
-        scales.stride(0), scales.stride(1),
-        qzeros.stride(0), qzeros.stride(1),
-        group_size,
-    )
-    return output
-
-
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
-    ],
-    key=['K', 'N'],
-)
-@triton.jit
-def dequantize_kernel(
-    # Pointers to matrices
-    b_ptr, b_scale_ptr, b_zp_ptr, fpb_ptr,
-    # Matrix dimensions
-    K, N, group_size,
-    stride_bk, stride_bn,
-    stride_bsk, stride_bsn,
-    stride_bzpk, stride_bzpn,
-    stride_fpbk, stride_fpbn,
-    # Meta-parameters
-    BLOCK_SIZE_K: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
-):
-    """Dequantize tile [BLOCK_SIZE_K, BLOCK_SIZE_N] in full precision.
-    We should assert BLOCK_SIZE_N % 8 == 0.
-    weight[K // 8, N], scale[K // group_size, N], zp[K // group_size, N // group_size]
-    """
-    k_block_idx = tl.program_id(axis=0)
-    n_block_idx = tl.program_id(axis=1)
-    offs_k = k_block_idx * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
-    offs_n = n_block_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    fpb_offs = offs_k[:, None] * stride_fpbk + offs_n[None, :] * stride_fpbn
-    b_offs = (offs_k[:, None] // 8) * stride_bk + offs_n[None, :] * stride_bn
-    bzp_offs = (offs_k[:, None] // group_size) * stride_bzpk + (offs_n[None, :] // 8) * stride_bzpn
-    bs_offs = (offs_k[:, None] // group_size) * stride_bsk + offs_n[None, :] * stride_bsn
-    n_mask = offs_n[None, :] < N
-    k_mask = offs_k[:, None] < K
-    mask = n_mask & k_mask
-    int32_b = tl.load(b_ptr + b_offs, mask=mask, other=0.0)
-    zp_b = tl.load(b_zp_ptr + bzp_offs, mask=mask, other=0.0)
-    scale_b = tl.load(b_scale_ptr + bs_offs, mask=mask, other=0.0)
-    b_shift = (offs_k[:, None] % 8) * 4
-    bzp_shift = (offs_n[None, :] % 8) * 4
-    fp_weight = (((int32_b >> b_shift) & 0xF) - ((zp_b >> bzp_shift) & 0xF)) * scale_b
-    tl.store(fpb_ptr + fpb_offs, fp_weight, mask=mask)
-
-
-def dequantize_int4(b, b_scale, b_zero_point, device, dtype, group_size):
-    Kw, N = b.shape
-    K = Kw * 8
-    fp_b = torch.ones((K, N), device=device, dtype=dtype)
-    grid = lambda META: (
-        triton.cdiv(K, META['BLOCK_SIZE_K']),
-        triton.cdiv(N, META['BLOCK_SIZE_N']), 
-    )
-    dequantize_kernel[grid](
-        b, b_scale, b_zero_point, fp_b,
-        K, N, group_size,
-        b.stride(0), b.stride(1),
-        b_scale.stride(0), b_scale.stride(1),
-        b_zero_point.stride(0), b_zero_point.stride(1),
-        fp_b.stride(0), fp_b.stride(1)
-    )
-    return fp_b
-
-
-def matmul_dequantize_int4_s1(a, b, b_scale, b_zero_point, group_size=128, out=None):
-    """
-    Matmul dequantize int4 s1 dequantize weight to `fp_b` and do fp16 torch.mm,
-    this is for `prefill` stage, since weight size is fixed so is dequantize overhead,
-    perfill stage have more tokens to amortize dequant cost.
-    """
-    assert a.is_contiguous(), "Matrix A must be contiguous"
-    # assert b.is_contiguous(), "Matrix B must be contiguous"
-    M, K = a.shape
-    Kw, N = b.shape
-    if out is None:
-        # Allocates output.
-        out = torch.empty((M, N), device=a.device, dtype=a.dtype)
-    fp_b = dequantize_int4(b, b_scale, b_zero_point, a.device, a.dtype, group_size)
-    torch.mm(a, fp_b, out=out)
-    fp_b = None
-    return out
-
-
-def quantize_int4(weight, group_size=128, tp_rank=0):
-    # Weight shape: [H1 // 8, H2]
-    # Scale shape: [H1 // group_size, H2]
-    # zero_pint shape: [H1 // group_size, H2 // 8]
-
-    weight = weight.transpose(1, 0)
-    h1, h2 = weight.shape
-    assert h1 % 8 == 0 and h2 % 8 == 0, "H1 {} H2 {}".format(h1, h2)
-    assert h2 % group_size == 0, "H1 {} H2 {}".format(h1, h2)
-    weight = weight.contiguous().view(-1, group_size).cuda(tp_rank)
-    weight_max = weight.amax(-1, keepdim=True)
-    weight_max = torch.where(weight_max < 0, 0, weight_max)
-    weight_min = weight.amin(-1, keepdim=True)
-    weight_min = torch.where(weight_min > 0, 0, weight_min)
-    weight_range = weight_max - weight_min 
-    scale = weight_range / (2 ** 4 - 1)
-    zero_point = (-weight_min / scale).round().clamp(0, 15).to(torch.int32)
-    weight = (weight / scale + zero_point).round().clamp(0, 15).to(torch.int32).view(h1, h2)
-    int_weight = torch.empty(h1, h2 // 8).to(torch.int32).to(weight.device)
-    int_zero_point = torch.zeros(h1 // 8, h2 // group_size).to(torch.int32).to(weight.device)
-    zero_point = zero_point.view(h1, -1)
-    scale = scale.view(h1, -1)
-    # pack 8 int4 in an int32 number.
-    # Weight pack in row.
-    for pack in range(0, h2, 8):
-        for i in range(8):
-            int_weight[:, pack // 8] += weight[:, pack + i] << (i * 4)
-    # zero point pack in col.
-    for pack in range(0, h1, 8):
-        for i in range(8):
-            int_zero_point[pack // 8, :] += zero_point[pack + i, :] << (i * 4)
-    '''
-    fp_weight = torch.zeros(h1, h2).half().to(weight.device)
-    for pack in range(0, h1 // 8):
-        for i in range(8):
-            fp_weight[pack * 8 + i, :] = \
-                ((int_weight[pack, :] << (28 - i * 4) >> 28) + 16) % 16
-    print((fp_weight - weight).abs().sum())
-
-    fp_zp = torch.zeros(zero_point.shape).half().to(zero_point.device)
-    for pack in range(0, h1 // 8):
-        for i in range(8):
-            fp_zp[pack * 8 + i, :] = \
-                (int_zero_point[pack, :] >> (i * 4)) & 15
-
-    print((fp_zp - zero_point).abs().sum())
-    '''
-    weight = None
-    return int_weight.transpose(1, 0).contiguous(), scale.transpose(1, 0).contiguous(), int_zero_point.transpose(1, 0).contiguous(), group_size
-
-
-def unpack_int4(weight, scale, zp):
-    """
-    Test function to verify quantize int4 is correct.
-    Will not be used in model inference.
-    """
-    weight = weight.transpose(1, 0)
-    scale = scale.transpose(1, 0)
-    zp = zp.transpose(1, 0)
-    h1, h2 = weight.shape
-    group_size = h2 * 8 // scale.shape[1]
-    group_num = scale.shape[1]
-    fp_weight = torch.zeros(h1, h2 * 8).half().to(weight.device)
-    fp_zero_point = torch.zeros(h1, group_num).to(weight.device)
-    for pack in range(0, h2):
-        for i in range(8):
-            fp_weight[:, pack * 8 + i] = (weight[:, pack] >> (i * 4)) & 0xF
-    for pack in range(0, h1 // 8):
-        for i in range(8):
-            fp_zero_point[pack * 8 + i, :] = (zp[pack, :] >> (i * 4)) & 0xF
-    for g in range(group_num):
-        fp_weight[:, g * group_size:(g + 1) * group_size] = (fp_weight[:, g * group_size:(g + 1) * group_size] - \
-                                                             fp_zero_point[:, g].unsqueeze(1)) * scale[:, g].unsqueeze(1)
-    return fp_weight.transpose(1, 0)
-
-
-def test_int4(M, K, N):
-    import time
-
-    print("M: {} K: {} N: {}".format(M, K, N))
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    int_b, b_scale, b_zero_point, _ = quantize_int4(b)
-    for _ in range(10):
-        triton_output = matmul_dequantize_int4_s1(a, int_b, b_scale, b_zero_point)
-    torch.cuda.synchronize()
-    iters = 512
-    t1 = time.time()
-    for _ in range(iters):
-        triton_output = matmul_dequantize_int4_s1(a, int_b, b_scale, b_zero_point)
-    torch.cuda.synchronize()
-    t2 = time.time()
-    triton_time = t2 - t1
-    print("Triton time cost", (t2 - t1))
-    for _ in range(10):
-        torch_output = torch.matmul(a, b)
-    torch.cuda.synchronize()
-    iters = 512
-    t1 = time.time()
-    for _ in range(iters):
-        torch_output = torch.matmul(a, b)
-    torch.cuda.synchronize()
-    t2 = time.time()
-    torch_time = t2 - t1
-    print("Torch time cost", (t2 - t1))
-    return triton_time, torch_time
-
-
-def test_correct_int4_s1(M=32, K=4096, N=4096):
-    group_size = 128
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    int_b, b_scale, b_zero_point, _ = quantize_int4(b, group_size=group_size)
-    cos = torch.nn.CosineSimilarity(0)
-    fp_weight = dequantize_int4(int_b, b_scale, b_zero_point, a.device, a.dtype, group_size)
-    print("Quantize cos", cos(fp_weight.flatten().to(torch.float32), b.flatten().to(torch.float32)))
-    triton_output = matmul_dequantize_int4_s1(a, int_b, b_scale, b_zero_point, group_size)
-    torch_output = torch.matmul(a, b)
-    print(f"triton_output={triton_output}")
-    print(f"torch_output={torch_output}")
-    print("Output cos", cos(triton_output.flatten().to(torch.float32), torch_output.flatten().to(torch.float32)))
-
-
-def test_correct_int4_s2(M=32, K=4096, N=4096):
-    group_size = 128
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    int_b, b_scale, b_zero_point, _ = quantize_int4(b, group_size=group_size)
-    cos = torch.nn.CosineSimilarity(0)
-    fp_weight = unpack_int4(int_b, b_scale, b_zero_point)
-    print("Quantize cos", cos(fp_weight.flatten().to(torch.float32), b.flatten().to(torch.float32)))
-    triton_output = matmul_dequantize_int4_s2(a, int_b, b_scale, b_zero_point, group_size)
-    torch_output = torch.matmul(a, b)
-    print(f"triton_output={triton_output}")
-    print(f"torch_output={torch_output}")
-    print("Output cos", cos(triton_output.flatten().to(torch.float32), torch_output.flatten().to(torch.float32)))
-
-
-def test_correct_int4_gptq(M=32, K=4096, N=4096):
-    group_size = 128
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    int_b, b_scale, b_zero_point, _ = quantize_int4(b, group_size=group_size)
-    cos = torch.nn.CosineSimilarity(0)
-    fp_weight = unpack_int4(int_b, b_scale, b_zero_point)
-    print("Quantize cos", cos(fp_weight.flatten().to(torch.float32), b.flatten().to(torch.float32)))
-    triton_output = matmul_dequantize_int4_gptq(a, int_b, b_scale, b_zero_point, group_size)
-    torch_output = torch.matmul(a, b)
-    print(f"triton_output={triton_output}")
-    print(f"torch_output={torch_output}")
-    print("Output cos", cos(triton_output.flatten().to(torch.float32), torch_output.flatten().to(torch.float32)))
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=['M'],  # Argument names to use as an x-axis for the plot
-        x_vals=[4, 8, 16, 32, 64, 128] + [
-            128 * i for i in range(2, 33, 2)
-        ],  # Different possible values for `x_name`
-        line_arg='provider',  # Argument name whose value corresponds to a different line in the plot
-        # Possible values for `line_arg`
-        line_vals=['cublas', 'triton-s1', 'dequantize', 'triton-s2', 'triton-gptq'],
-        # Label name for the lines
-        line_names=["cuBLAS", "Triton-s1", "Dequant(GB/s)", "Triton-s2", "Triton-gptq"],
-        # Line styles
-        styles=[('green', '-'), ('blue', '-'), ('red', '-'), ('purple', '-'), ('yellow', '-')],
-        ylabel="TFLOPS",  # Label name for the y-axis
-        plot_name="matmul-performance",  # Name for the plot, used also as a file name for saving the plot.
-        args={},
-    )
-)
-def benchmark(M, provider):
-    K = 4096
-    N = 4096
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    quantiles = [0.5, 0.2, 0.8]
-    if provider == 'cublas':
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), quantiles=quantiles)
-        perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    if provider == 'triton-s1':
-        intb, b_scale, bzp, _ = quantize_int4(b, group_size=64)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul_dequantize_int4_s1(a, intb, b_scale, bzp, 64), quantiles=quantiles)
-        perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    if provider == 'triton-s2':
-        intb, b_scale, bzp, _ = quantize_int4(b, group_size=64)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul_dequantize_int4_s2(a, intb, b_scale, bzp, 64), quantiles=quantiles)
-        perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    if provider == 'dequantize':
-        intb, b_scale, bzp, _ = quantize_int4(b, group_size=64)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: dequantize_int4(intb, b_scale, bzp, 'cuda', torch.float16, 64), quantiles=quantiles)        
-        perf = lambda ms: 2 * M * K * 1e-9 / (ms * 1e-3)
-    if provider == 'triton-gptq':
-        intb, b_scale, bzp, _ = quantize_int4(b, group_size=64)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul_dequantize_int4_gptq(a, intb, b_scale, bzp, 64), quantiles=quantiles)
-        perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    return perf(ms), perf(max_ms), perf(min_ms)
-
-
-def test_model_layer(bs, sqe_len, hidden, inter, tp):
-    st1 = 0
-    st2 = 0
-    t1, t2 = test_int4(bs * sqe_len, hidden, hidden * 3 // tp)
-    st1 += t1
-    st2 += t2
-    t1, t2 = test_int4(bs * sqe_len, hidden // tp, hidden)
-    st1 += t1
-    st2 += t2
-    t1, t2 = test_int4(bs * sqe_len, hidden, inter * 2 // tp)
-    st1 += t1
-    st2 += t2
-    t1, t2 = test_int4(bs * sqe_len, inter // tp, hidden)
-    st1 += t1
-    st2 += t2
-    print("Triton time {} Torch time {}".format(st1, st2))
-
-
-if __name__ == "__main__":
-    # test_correct_int4_s1()
-    # test_correct_int4_s2()
-    # test_correct_int4_gptq()
-    benchmark.run(show_plots=True, print_data=True)
-    exit()
-    bs = 32
-    hidden = 4096
-    inter = 11008
-    prefill_len = 512
-    decode_len = 1
-    tp = 1
-    test_model_layer(bs, prefill_len, hidden, inter, tp)
-    test_model_layer(bs, decode_len, hidden, inter, tp)
diff --git a/lightllm/common/basemodel/triton_kernel/dequantize_gemm_int8.py b/lightllm/common/basemodel/triton_kernel/dequantize_gemm_int8.py
deleted file mode 100644
index e2c5c0dc9..000000000
--- a/lightllm/common/basemodel/triton_kernel/dequantize_gemm_int8.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import torch
-
-import triton
-import triton.language as tl
-
-
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128}, num_stages=3, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256}, num_stages=3, num_warps=8),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
-        triton.Config({'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
-    ],
-    key=['K', 'N'],
-)
-
-
-@triton.jit
-def dequantize_kernel(
-    # Pointers to matrices
-    b_ptr, b_scale_ptr, fpb_ptr,
-    # Matrix dimensions
-    K, N,
-    stride_bk, stride_bn,
-    stride_fpbk, stride_fpbn,
-    # Meta-parameters
-    BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-):
-    """Kernel for computing the matmul C = A x B.
-    A has shape (M, K), B has shape (K, N) and C has shape (M, N)
-    """
-    k_block_idx = tl.program_id(axis=0)
-    n_block_idx = tl.program_id(axis=1)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    offs_n = tl.arange(0, BLOCK_SIZE_N)
-    b_offs = (k_block_idx * BLOCK_SIZE_K + offs_k[:, None]) * stride_bk + \
-        (n_block_idx * BLOCK_SIZE_N + offs_n[None, :]) * stride_bn
-    fpb_offs = (k_block_idx * BLOCK_SIZE_K + offs_k[:, None]) * stride_fpbk + \
-        (n_block_idx * BLOCK_SIZE_N + offs_n[None, :]) * stride_fpbn
-    bs_offs = n_block_idx * BLOCK_SIZE_N + offs_n[None, :]
-    n_mask = n_block_idx * BLOCK_SIZE_N + offs_n[None, :] < N
-    mask = (k_block_idx * BLOCK_SIZE_K + offs_k[:, None] < K) & n_mask
-    int_b = tl.load(b_ptr + b_offs, mask=mask, other=0.0)
-    scale_b = tl.load(b_scale_ptr + bs_offs, mask=n_mask, other=0.0)
-    tl.store(fpb_ptr + fpb_offs, int_b * scale_b, mask=mask)
-
-
-def matmul_dequantize_int8(a, b, b_scale, out=None):
-    # Check constraints.
-    assert a.shape[1] == b.shape[0], "Incompatible dimensions"
-    assert a.is_contiguous(), "Matrix A must be contiguous"
-    # assert b.is_contiguous(), "Matrix B must be contiguous"
-    M, K = a.shape
-    K, N = b.shape
-    if out == None:
-        # Allocates output.
-        c = torch.empty((M, N), device=a.device, dtype=a.dtype)
-    else:
-        c = out
-    fp_b = torch.empty((K, N), device=a.device, dtype=a.dtype)
-    grid = lambda META: (
-        triton.cdiv(K, META['BLOCK_SIZE_K']), triton.cdiv(N, META['BLOCK_SIZE_N']),
-    )
-    dequantize_kernel[grid](
-        b, b_scale, fp_b,
-        K, N,
-        b.stride(0), b.stride(1),
-        fp_b.stride(0), fp_b.stride(1)
-    )
-    torch.mm(a, fp_b, out=c)
-    return c
-
-
-def quantize_int8(weight, axis=0, tp_rank=0):
-    # Weight shape: [H1, H2]
-    # Scale shape: [H2]
-    scale = weight.abs().amax(axis, keepdim=True) / 127.
-    weight = (weight / scale).to(torch.int8)
-    if axis == 0:
-        weight = weight.t().contiguous().t()
-    scale = scale.squeeze(axis)
-    return weight.contiguous().cuda(tp_rank), scale.contiguous().cuda(tp_rank)
-
-
-def test_int8(M, K, N):
-    import time
-
-    print("M: {} K: {} N: {}".format(M, K, N))
-    torch.manual_seed(0)
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    int_b, b_scale = quantize_int8(b)
-    for _ in range(10):
-        triton_output = matmul_dequantize_int8(a, int_b, b_scale.unsqueeze(0))
-    torch.cuda.synchronize()
-    iters = 512
-    t1 = time.time()
-    for _ in range(iters):
-        triton_output = matmul_dequantize_int8(a, int_b, b_scale.unsqueeze(0))
-    torch.cuda.synchronize()
-    t2 = time.time()
-    triton_time = t2 - t1
-    print("Triton time cost", (t2 - t1))
-    for _ in range(10):
-        torch_output = torch.matmul(a, b)
-    torch.cuda.synchronize()
-    iters = 512
-    t1 = time.time()
-    for _ in range(iters):
-        torch_output = torch.matmul(a, b)
-    torch.cuda.synchronize()
-    t2 = time.time()
-    torch_time = t2 - t1
-    print("Torch time cost", (t2 - t1))
-    return triton_time, torch_time
-
-
-def test_correct_int8(M=512, K=4096, N=4096):
-    import time
-
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    int_b, b_scale = quantize_int8(b)
-    cos = torch.nn.CosineSimilarity(0)
-    triton_output = matmul_dequantize_int8(a, int_b, b_scale)
-    torch_output = torch.matmul(a, b)
-    print(f"triton_output={triton_output}")        
-    print(f"torch_output={torch_output}")
-    print("Output cos ", cos(triton_output.flatten().to(torch.float32), torch_output.flatten().to(torch.float32)))
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=['M', 'N', 'K'],  # Argument names to use as an x-axis for the plot
-        x_vals=[32, 64, 128, 256] + [
-            512 * i for i in range(1, 33)
-        ],  # Different possible values for `x_name`
-        line_arg='provider',  # Argument name whose value corresponds to a different line in the plot
-        # Possible values for `line_arg`
-        line_vals=['cublas', 'triton'],
-        # Label name for the lines
-        line_names=["cuBLAS", "Triton"],
-        # Line styles
-        styles=[('green', '-'), ('blue', '-')],
-        ylabel="TFLOPS",  # Label name for the y-axis
-        plot_name="matmul-performance",  # Name for the plot, used also as a file name for saving the plot.
-        args={},
-    )
-)
-
-
-def benchmark(M, N, K, provider):
-    quantiles = [0.5, 0.2, 0.8]
-    if provider == 'cublas':
-        a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-        b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), quantiles=quantiles)
-    if provider == 'triton':
-        a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-        b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-        intb, b_scale = quantize_int8(b)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul_dequantize_int8(a, intb, b_scale), quantiles=quantiles)
-    perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    return perf(ms), perf(min_ms), perf(max_ms)
-
-
-def test_model_layer(bs, sqe_len, hidden, inter, tp):
-    st1 = 0
-    st2 = 0
-    t1, t2 = test_int8(bs * sqe_len, hidden, hidden * 3 // tp)
-    st1 += t1
-    st2 += t2
-    t1, t2 = test_int8(bs * sqe_len, hidden // tp, hidden)
-    st1 += t1
-    st2 += t2
-    t1, t2 = test_int8(bs * sqe_len, hidden, inter * 2 // tp)
-    st1 += t1
-    st2 += t2
-    t1, t2 = test_int8(bs * sqe_len, inter // tp, hidden)
-    st1 += t1
-    st2 += t2
-    print("Triton time {} Torch time {}".format(st1, st2))
-
-
-if __name__ == "__main__":
-    test_correct_int8()
-    benchmark.run(show_plots=True, print_data=True)
-
-    bs = 32
-    hidden = 4096
-    inter  = 11008
-    prefill_len = 512
-    decode_len = 1
-    tp = 1
-    test_model_layer(bs, prefill_len, hidden, inter, tp)
-    test_model_layer(bs, decode_len, hidden, inter, tp)
\ No newline at end of file
diff --git a/lightllm/common/basemodel/triton_kernel/layernorm.py b/lightllm/common/basemodel/triton_kernel/norm/layernorm.py
similarity index 100%
rename from lightllm/common/basemodel/triton_kernel/layernorm.py
rename to lightllm/common/basemodel/triton_kernel/norm/layernorm.py
diff --git a/lightllm/common/basemodel/triton_kernel/qk_norm.py b/lightllm/common/basemodel/triton_kernel/norm/qk_norm.py
similarity index 100%
rename from lightllm/common/basemodel/triton_kernel/qk_norm.py
rename to lightllm/common/basemodel/triton_kernel/norm/qk_norm.py
diff --git a/lightllm/common/basemodel/triton_kernel/rmsnorm.py b/lightllm/common/basemodel/triton_kernel/norm/rmsnorm.py
similarity index 100%
rename from lightllm/common/basemodel/triton_kernel/rmsnorm.py
rename to lightllm/common/basemodel/triton_kernel/norm/rmsnorm.py
diff --git a/lightllm/common/quantization/triton_quant/__init__.py b/lightllm/common/basemodel/triton_kernel/quantization/__init__.py
similarity index 100%
rename from lightllm/common/quantization/triton_quant/__init__.py
rename to lightllm/common/basemodel/triton_kernel/quantization/__init__.py
diff --git a/lightllm/common/basemodel/triton_kernel/bmm_scaled_fp8.py b/lightllm/common/basemodel/triton_kernel/quantization/bmm_scaled_fp8.py
similarity index 100%
rename from lightllm/common/basemodel/triton_kernel/bmm_scaled_fp8.py
rename to lightllm/common/basemodel/triton_kernel/quantization/bmm_scaled_fp8.py
diff --git a/lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py b/lightllm/common/basemodel/triton_kernel/quantization/fp8act_quant_kernel.py
similarity index 100%
rename from lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py
rename to lightllm/common/basemodel/triton_kernel/quantization/fp8act_quant_kernel.py
diff --git a/lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py b/lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_block_gemm_kernel.py
similarity index 100%
rename from lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py
rename to lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_block_gemm_kernel.py
diff --git a/lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_quant_kernel.py b/lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_block_quant_kernel.py
similarity index 100%
rename from lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_quant_kernel.py
rename to lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_block_quant_kernel.py
diff --git a/lightllm/common/quantization/triton_quant/fp8/fp8w8a8_scaled_mm_per_token_kernel.py b/lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_scaled_mm_per_token_kernel.py
similarity index 100%
rename from lightllm/common/quantization/triton_quant/fp8/fp8w8a8_scaled_mm_per_token_kernel.py
rename to lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_scaled_mm_per_token_kernel.py
diff --git a/lightllm/common/basemodel/triton_kernel/q_per_head_fp8_quant.py b/lightllm/common/basemodel/triton_kernel/quantization/q_per_head_fp8_quant.py
similarity index 100%
rename from lightllm/common/basemodel/triton_kernel/q_per_head_fp8_quant.py
rename to lightllm/common/basemodel/triton_kernel/quantization/q_per_head_fp8_quant.py
diff --git a/lightllm/common/basemodel/triton_kernel/quantize_gemm_int8.py b/lightllm/common/basemodel/triton_kernel/quantize_gemm_int8.py
deleted file mode 100644
index 4f3f6a385..000000000
--- a/lightllm/common/basemodel/triton_kernel/quantize_gemm_int8.py
+++ /dev/null
@@ -1,376 +0,0 @@
-import time
-import torch
-
-import triton
-import triton.language as tl
-
-
-@triton.autotune(
-    configs=[
-        triton.Config({}, num_stages=2, num_warps=8),
-        triton.Config({}, num_stages=2, num_warps=4),
-        triton.Config({}, num_stages=2, num_warps=2),
-        triton.Config({}, num_stages=2, num_warps=1),
-     ],
-    key=['K'],
-)
-@triton.jit
-def quantize_int8_perrow_kernel(
-    fpa_ptr, a_ptr, as_ptr,
-    M, K, 
-    stride_fpam, stride_fpak,
-    stride_am, stride_ak,
-    stride_asm,
-    # Meta-parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-):
-    pid_m = tl.program_id(axis=0)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-
-    fpa_ptrs = fpa_ptr + offs_am[:, None] * stride_fpam + offs_k[None, :] * stride_fpak
-    a_ptrs = a_ptr + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
-    a_max = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        fpa = tl.load(fpa_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
-        a_max = tl.maximum(a_max, tl.max(tl.abs(fpa), axis=1))
-        fpa_ptrs += BLOCK_SIZE_K * stride_fpak
-    a_scale = (a_max / 127.)
-    fpa_ptrs = fpa_ptr + offs_am[:, None] * stride_fpam + offs_k[None, :] * stride_fpak
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        fpa = tl.load(fpa_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
-        inta = (fpa / a_scale[:, None]).to(tl.int8)
-        tl.store(a_ptrs, inta, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K)
-        fpa_ptrs += BLOCK_SIZE_K * stride_fpak
-        a_ptrs += BLOCK_SIZE_K * stride_ak
-    as_offs = pid_m * BLOCK_SIZE_M * stride_asm + tl.arange(0, BLOCK_SIZE_M)
-    tl.store(as_ptr + as_offs, a_scale)
-
-
-def quantize_int8_perrow(fpa):
-    a = torch.empty(fpa.shape, device=fpa.device, dtype=torch.int8)
-    a_scale = torch.empty(fpa.shape[0], device=fpa.device, dtype=fpa.dtype)
-    M, K = fpa.shape
-    BLOCK_SIZE_M = 1
-    BLOCK_SIZE_K = triton.next_power_of_2(K)
-    grid = (M // BLOCK_SIZE_M,)
-    quantize_int8_perrow_kernel[grid](
-        fpa, a, a_scale,
-        M, K,
-        fpa.stride(0), fpa.stride(1),
-        a.stride(0), a.stride(1),
-        a_scale.stride(0),
-        BLOCK_SIZE_M, BLOCK_SIZE_K,
-    )
-    return a, a_scale
-
-
-@triton.autotune(
-    configs=[
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 32,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
-        triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32,  'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64,  'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32,  'GROUP_SIZE_M': 16}, num_stages=4, num_warps=4),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64,  'GROUP_SIZE_M': 16}, num_stages=3, num_warps=8),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64,  'GROUP_SIZE_M': 16}, num_stages=4, num_warps=4),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 16}, num_stages=3, num_warps=8),
-	    triton.Config({'SPLIT_K': 1, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 32,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32,  'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64,  'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-        triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32,  'GROUP_SIZE_M': 16}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64,  'GROUP_SIZE_M': 16}, num_stages=3, num_warps=8),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64,  'GROUP_SIZE_M': 16}, num_stages=4, num_warps=4),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 16}, num_stages=3, num_warps=8),
-		triton.Config({'SPLIT_K': 2, 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 16}, num_stages=2, num_warps=4),
-    ],
-    key=['M', 'N', 'K'],
-    reset_to_zero=['c_ptr']
-)
-@triton.jit
-def matmul_kernel(
-    # Pointers to matrices
-    a_ptr, as_ptr, b_ptr, bs_ptr, c_ptr,
-    # Matrix dimensions
-    M, N, K,
-    # The stride variables represent how much to increase the ptr by when moving by 1
-    # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr`
-    # by to get the element one row down (A has M rows).
-    stride_am, stride_ak,
-    stride_asm,
-    stride_bk, stride_bn,
-    stride_bsn,
-    stride_cm, stride_cn,
-    # Meta-parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, SPLIT_K: tl.constexpr, 
-):
-    """Kernel for computing the matmul C = A x B.
-    A has shape (M, K), B has shape (K, N) and C has shape (M, N)
-    """
-    # -----------------------------------------------------------
-    # Map program ids `pid` to the block of C it should compute.
-    # This is done in a grouped ordering to promote L2 data reuse.
-    # See above `L2 Cache Optimizations` section for details.
-    pid = tl.program_id(axis=0)
-    pid_sp_k = tl.program_id(axis=1)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    # ----------------------------------------------------------
-    # Create pointers for the first blocks of A and B.
-    # We will advance this pointer as we move in the K direction
-    # and accumulate
-    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
-    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
-    # See above `Pointer Arithmetics` section for details
-    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
-    offs_k = pid_sp_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
-    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
-    as_ptrs = as_ptr + offs_am * stride_asm
-    bs_ptrs = bs_ptr + offs_bn * stride_bsn
-    a_scale = tl.load(as_ptrs, mask=offs_am < M, other=0.0)
-    b_scale = tl.load(bs_ptrs, mask=offs_bn < N, other=0.0)
-    # -----------------------------------------------------------
-    # Iterate to compute a block of the C matrix.
-    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
-    # of fp32 values for higher accuracy.
-    # `accumulator` will be converted back to fp16 after the loop.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.int32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
-        # Load the next block of A and B, generate a mask by checking the K dimension.
-        # If it is out of bounds, set it to 0.
-        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K * SPLIT_K, other=0.0)
-        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K * SPLIT_K, other=0.0)
-        # We accumulate along the K dimension.
-        accumulator += tl.dot(a, b)
-        # Advance the ptrs to the next K block.
-        a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
-        b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk
-    # You can fuse arbitrary activation functions here
-    # while the accumulator is still in FP32!
-    c = (accumulator.to(tl.float32) * a_scale[:, None] * b_scale[None, :]).to(c_ptr.dtype.element_ty)
-    # -----------------------------------------------------------
-    # Write back the block of the output matrix C with masks.
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-    if SPLIT_K == 1:
-        tl.store(c_ptrs, c, mask=c_mask)
-    else:
-        tl.atomic_add(c_ptrs, c, mask=c_mask)
-
-
-def matmul_quantize_int8(fpa, b, b_scale, out=None):
-    a, a_scale = quantize_int8_perrow(fpa)
-    # a, a_scale = quantize_int8(fpa, axis=1)
-    return matmul_int8(a, a_scale, b, b_scale, out)
-
-
-def matmul_int8(a, a_scale, b, b_scale, out=None):
-    # Check constraints.
-    assert a.shape[1] == b.shape[0], "Incompatible dimensions"
-    M, K = a.shape
-    K, N = b.shape
-    # Allocates output.
-    if out == None:
-        c = torch.zeros((M, N), device=a.device, dtype=torch.float16)
-    else:
-        c = out.fill_(0.)
-    grid = lambda META: (
-        triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
-        META['SPLIT_K'],
-    )
-    matmul_kernel[grid](
-        a, a_scale, b, b_scale, c,
-        M, N, K,
-        a.stride(0), a.stride(1),
-        a_scale.stride(0),
-        b.stride(0), b.stride(1),
-        b_scale.stride(0),
-        c.stride(0), c.stride(1),
-    )
-    return c
-
-
-def quantize_int8(weight, axis=0, tp_rank=0):
-    # Weight shape: [H1, H2]
-    # Scale shape: [H2]
-    scale = weight.abs().amax(axis, keepdim=True) / 127.
-    weight = (weight / scale).to(torch.int8)
-    # col major will accelerate i8xi8 kernel.
-    if axis == 0:
-        weight = weight.t().contiguous().t()
-    scale = scale.squeeze(axis)
-    return weight.contiguous().cuda(tp_rank), scale.contiguous().cuda(tp_rank)
-
-
-def test_correct_int8(M=32, N=4096, K=4096):
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-    int_a, scale_a = quantize_int8_perrow(a)
-    cos = torch.nn.CosineSimilarity(0)
-    print("Quantization cos", cos((int_a * scale_a.unsqueeze(1)).flatten().to(torch.float32), a.flatten().to(torch.float32)))
-    int_b, scale_b = quantize_int8(b, axis=0)
-    triton_output = matmul_int8(int_a, scale_a, int_b, scale_b)
-    torch_output = torch.matmul(a, b)
-    print(f"triton_output={triton_output}")
-    print(f"torch_output={torch_output}")
-    cos = torch.nn.CosineSimilarity(0)
-    print("Output cos", cos(triton_output.flatten().to(torch.float32), torch_output.flatten().to(torch.float32)))
-
-
-def test_int8(M, K, N):
-    import time
-
-    print("M: {} K: {} N: {}".format(M, K, N))
-    torch.manual_seed(0)
-    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-    b = torch.randn((K, N), device='cuda', dtype=torch.float16).contiguous()
-    int_b, scale_b = quantize_int8(b, axis=0)
-    for _ in range(10):
-        # int_a, a_scale = quantize_int8(a, 1)
-        int_a, a_scale = quantize_int8_perrow(a)
-        triton_output = matmul_int8(int_a, a_scale, int_b, scale_b)
-    torch.cuda.synchronize()
-    iters = 512
-    t1 = time.time()
-    for _ in range(iters):
-        #int_a, a_scale, _ = quantize_int8(a, 1)
-        int_a, a_scale = quantize_int8_perrow(a)
-    torch.cuda.synchronize()
-    qt2 = time.time()
-    for _ in range(iters):
-        triton_output = matmul_int8(int_a, a_scale, int_b, scale_b)
-    torch.cuda.synchronize()
-    t2 = time.time()
-    quant_time = qt2 - t1
-    triton_time = t2 - qt2
-    triton_tflops = 2 * M * N * K * 1e-12 / (triton_time / iters)
-    quant_bandwith = 2 * M * K * 1e-9 / (quant_time / iters)
-    print("Triton time cost: {} (tflops {}) + quant: {} (bandwidth {})".format(
-        triton_time, triton_tflops, quant_time, quant_bandwith))
-    for _ in range(10):
-        torch_output = torch.matmul(a, b)
-    torch.cuda.synchronize()
-    iters = 512
-    t1 = time.time()
-    for _ in range(iters):
-        torch_output = torch.matmul(a, b)
-    torch.cuda.synchronize()
-    t2 = time.time()
-    torch_time = t2 - t1
-    torch_tflops = 2 * M * N * K * 1e-12 / (torch_time / iters)
-    print("Torch time cost: {} (tflops {})".format(t2 - t1, torch_tflops))
-    return triton_time, torch_time, quant_time
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=['M'],  # Argument names to use as an x-axis for the plot
-        x_vals=[32, 64, 128, 256] + [
-            512 * i * 2 for i in range(1, 17)
-        ],  # Different possible values for `x_name`
-        line_arg='provider',  # Argument name whose value corresponds to a different line in the plot
-        # Possible values for `line_arg`
-        line_vals=['cublas', 'triton-i8', 'triton-quant-i8', 'quant-perrow'],
-        # Label name for the lines
-        line_names=["cuBLAS", "Triton-i8", "Triton-Quant-i8", "Quant-perrow(GB/s)"],
-        # Line styles
-        styles=[('green', '-'), ('blue', '-'), ('red', '-'), ('purple', '-')],
-        ylabel="TFLOPS",  # Label name for the y-axis
-        plot_name="matmul-performance",  # Name for the plot, used also as a file name for saving the plot.
-        args={},
-    )
-)
-def benchmark(M, provider):
-    K = 10240
-    N = 27392 * 2 // 8
-    quantiles = [0.5, 0.2, 0.8]
-    if provider == 'cublas':
-        a = torch.randn((M, K), device='cuda', dtype=torch.float16)
-        b = torch.randn((K, N), device='cuda', dtype=torch.float16)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), quantiles=quantiles)
-        perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    if provider == 'triton-i8':
-        a = torch.randn((M, K), device='cuda', dtype=torch.float16).to(torch.int8).contiguous()
-        b = torch.randn((K, N), device='cuda', dtype=torch.float16).to(torch.int8).contiguous()
-        int_a, a_scale = quantize_int8(a, axis=1)
-        int_b, b_scale = quantize_int8(b, axis=0)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul_int8(int_a, a_scale, int_b, b_scale), quantiles=quantiles)
-        perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    if provider == 'triton-quant-i8':
-        a = torch.randn((M, K), device='cuda', dtype=torch.float16).to(torch.int8).contiguous()
-        b = torch.randn((K, N), device='cuda', dtype=torch.float16).to(torch.int8).contiguous()
-        int_b, b_scale = quantize_int8(b, axis=0)
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul_quantize_int8(a, int_b, b_scale), quantiles=quantiles)
-        perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
-    if provider == 'quant-perrow':
-        a = torch.randn((M, K), device='cuda', dtype=torch.float16).to(torch.int8).contiguous()
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: quantize_int8_perrow(a), quantiles=quantiles)
-        perf = lambda ms: 2 * M * K * 1e-9 / (ms * 1e-3)
-    return perf(ms), perf(min_ms), perf(max_ms)
-
-
-def test_model_layer(bs, sqe_len, hidden, inter, tp):
-    st1 = 0
-    st2 = 0
-    st3 = 0
-    t1, t2, t3 = test_int8(bs * sqe_len, hidden, hidden * 3 // tp)
-    st1 += t1
-    st2 += t2
-    st3 += t3
-    t1, t2, t3 = test_int8(bs * sqe_len, hidden // tp, hidden)
-    st1 += t1
-    st2 += t2
-    st3 += t3
-    t1, t2, t3 = test_int8(bs * sqe_len, hidden, inter * 2 // tp)
-    st1 += t1
-    st2 += t2
-    st3 += t3
-    t1, t2, t3 = test_int8(bs * sqe_len, inter // tp, hidden)
-    st1 += t1
-    st2 += t2
-    st3 += t3
-    print("Triton time {} Torch time {} Quant time {}".format(st1, st2, st3))
-
-
-if __name__ == "__main__":
-    test_correct_int8()
-    benchmark.run(show_plots=True, print_data=True)
-
-    bs = 32
-    hidden = 4096
-    inter  = 11008
-    prefill_len = 512
-    decode_len = 1
-    tp = 1
-    test_model_layer(bs, prefill_len, hidden, inter, tp)
-    test_model_layer(bs, decode_len, hidden, inter, tp)
diff --git a/lightllm/common/quantization/__init__.py b/lightllm/common/quantization/__init__.py
index d5289298c..8cbcc2e68 100644
--- a/lightllm/common/quantization/__init__.py
+++ b/lightllm/common/quantization/__init__.py
@@ -1,92 +1,13 @@
-import yaml
-import collections
-from .registry import QUANTMETHODS
-from .backend import QUANT_BACKEND
-from lightllm.utils.log_utils import init_logger
-
-# Import all type classes (they auto-register with QUANTMETHODS)
-from .types import (
-    NoQuantization,
-    FP8Block128Quantization,
-    FP8PerTokenQuantization,
-    W8A8Quantization,
-    AWQQuantization,
-)
-
-# Re-export for backwards compatibility
-from .types.awq import is_awq_marlin_compatible
-
-logger = init_logger(__name__)
-
-
-class Quantcfg:
-    def __init__(self, network_config, quant_type="none", custom_cfg_path=None):
-        self.layer_num = network_config["n_layer"]
-        self.quant_type = quant_type
-        self.network_config_ = network_config
-        self._parse_custom_cfg(custom_cfg_path)
-        self._parse_network_config(network_config)
-
-    def _parse_network_config(self, network_config):
-        hf_quantization_config = network_config.get("quantization_config", None)
-        if hf_quantization_config is None:
-            self.quantized_weight = False
-            self.static_activation = False
-            self.hf_quantization_config = None
-            return
-        self.quantized_weight = True
-        activation_scheme = network_config.get("activation_scheme", "dynamic")
-        self.static_activation = activation_scheme == "static"
-        self.hf_quantization_config = hf_quantization_config
-        self.hf_quantization_method = hf_quantization_config["quant_method"]
-        self._mapping_quant_method()
-
-    def _mapping_quant_method(self):
-        if self.hf_quantization_method == "fp8":
-            block_size = self.hf_quantization_config.get("weight_block_size", None)
-            if block_size == [128, 128]:
-                self.quant_type = "fp8-block128"
-                logger.info(
-                    f"Selected quant type: fp8-block128, backend: {QUANT_BACKEND.get_backend('fp8-block128').name}"
-                )
-            else:
-                self.quant_type = "fp8-per-token"
-                logger.info(
-                    f"Selected quant type: fp8-per-token, backend: {QUANT_BACKEND.get_backend('fp8-per-token').name}"
-                )
-        elif self.hf_quantization_method == "awq":
-            self.quant_type = "awq"
-            logger.info("Selected quant type: awq (marlin auto-selected if compatible)")
-        else:
-            # TODO: more quant methods
-            raise NotImplementedError(f"Quant method {self.hf_quantization_method} not implemented yet.")
-            pass
-
-    def _parse_custom_cfg(self, custom_cfg_path):
-        self.quant_cfg = collections.defaultdict(dict)
-        if custom_cfg_path is None:
-            return
-
-        with open(custom_cfg_path, "r") as file:
-            data = yaml.safe_load(file)
-
-        self.quant_type = data["quant_type"]
-        for layer_quant_cfg in data.get("mix_bits", []):
-            name = layer_quant_cfg["name"]
-            layer_nums = layer_quant_cfg.get("layer_nums", range(self.layer_num))
-            layer_quant_type = layer_quant_cfg["quant_type"]
-            for layer_num in layer_nums:
-                self.quant_cfg[layer_num].update({name: layer_quant_type})
-
-    def get_quant_type(self, layer_num, name):
-        layer_config = self.quant_cfg.get(layer_num, None)
-        if layer_config is None:
-            return self.quant_type
-        quant_type = layer_config.get(name, self.quant_type)
-        return quant_type
-
-    def get_quant_method(self, layer_num, name):
-        quant_type = self.get_quant_type(layer_num, name)
-        quant_method = QUANTMETHODS.get(quant_type)
-        quant_method.hf_quantization_config = self.hf_quantization_config
-        return quant_method
+from .no_quant import NoQuantization
+from .fp8_block128 import FP8Block128Quantization
+from .fp8_per_token import FP8PerTokenQuantization
+from .w8a8 import W8A8Quantization
+from .awq import AWQQuantization
+
+__all__ = [
+    "NoQuantization",
+    "FP8Block128Quantization",
+    "FP8PerTokenQuantization",
+    "W8A8Quantization",
+    "AWQQuantization",
+]
diff --git a/lightllm/common/quantization/types/awq.py b/lightllm/common/quantization/awq.py
similarity index 100%
rename from lightllm/common/quantization/types/awq.py
rename to lightllm/common/quantization/awq.py
diff --git a/lightllm/common/quantization/backend.py b/lightllm/common/quantization/backend.py
deleted file mode 100644
index e6d081ec2..000000000
--- a/lightllm/common/quantization/backend.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import os
-from enum import Enum, auto
-from lightllm.utils.log_utils import init_logger
-
-logger = init_logger(__name__)
-
-
-class BackendType(Enum):
-    TRITON = auto()
-    VLLM = auto()
-    DEEPGEMM = auto()
-
-
-class BackendRegistry:
-    _instance = None
-
-    def __new__(cls):
-        if cls._instance is None:
-            cls._instance = super().__new__(cls)
-            cls._instance._initialized = False
-        return cls._instance
-
-    def __init__(self):
-        if self._initialized:
-            return
-        self._initialized = True
-
-        self._force_triton = os.getenv("LIGHTLLM_USE_TRITON_QUANT", "0").upper() in ["1", "TRUE", "ON"]
-
-        self._has_vllm = self._check_vllm()
-        self._has_deepgemm = self._check_deepgemm()
-
-        if self._force_triton:
-            logger.info("LIGHTLLM_USE_TRITON_QUANT is set, forcing Triton backend for quantization")
-        else:
-            logger.info(f"Available quantization backends: vLLM={self._has_vllm}, DeepGEMM={self._has_deepgemm}")
-
-    def _check_vllm(self) -> bool:
-        try:
-            from lightllm.utils.vllm_utils import HAS_VLLM
-
-            return HAS_VLLM
-        except ImportError:
-            return False
-
-    def _check_deepgemm(self) -> bool:
-        try:
-            import deep_gemm  # noqa: F401
-
-            return True
-        except ImportError:
-            return False
-
-    @property
-    def force_triton(self) -> bool:
-        return self._force_triton
-
-    @property
-    def has_vllm(self) -> bool:
-        return self._has_vllm
-
-    @property
-    def has_deepgemm(self) -> bool:
-        return self._has_deepgemm
-
-    def get_backend(self, quant_type: str) -> BackendType:
-        if self._force_triton:
-            return BackendType.TRITON
-
-        if quant_type == "fp8-block128":
-            if self._has_deepgemm:
-                return BackendType.DEEPGEMM
-            elif self._has_vllm:
-                return BackendType.VLLM
-        elif quant_type in ["w8a8", "fp8-per-token"]:
-            if self._has_vllm:
-                return BackendType.VLLM
-
-        return BackendType.TRITON
-
-
-QUANT_BACKEND = BackendRegistry()
diff --git a/lightllm/common/quantization/types/fp8_block128.py b/lightllm/common/quantization/fp8_block128.py
similarity index 100%
rename from lightllm/common/quantization/types/fp8_block128.py
rename to lightllm/common/quantization/fp8_block128.py
diff --git a/lightllm/common/quantization/types/fp8_per_token.py b/lightllm/common/quantization/fp8_per_token.py
similarity index 100%
rename from lightllm/common/quantization/types/fp8_per_token.py
rename to lightllm/common/quantization/fp8_per_token.py
diff --git a/lightllm/common/quantization/types/no_quant.py b/lightllm/common/quantization/no_quant.py
similarity index 100%
rename from lightllm/common/quantization/types/no_quant.py
rename to lightllm/common/quantization/no_quant.py
diff --git a/lightllm/common/quantization/triton_quant/fp8/__init__.py b/lightllm/common/quantization/triton_quant/fp8/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/lightllm/common/quantization/types/__init__.py b/lightllm/common/quantization/types/__init__.py
deleted file mode 100644
index 8cbcc2e68..000000000
--- a/lightllm/common/quantization/types/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from .no_quant import NoQuantization
-from .fp8_block128 import FP8Block128Quantization
-from .fp8_per_token import FP8PerTokenQuantization
-from .w8a8 import W8A8Quantization
-from .awq import AWQQuantization
-
-__all__ = [
-    "NoQuantization",
-    "FP8Block128Quantization",
-    "FP8PerTokenQuantization",
-    "W8A8Quantization",
-    "AWQQuantization",
-]
diff --git a/lightllm/common/quantization/types/w8a8.py b/lightllm/common/quantization/w8a8.py
similarity index 75%
rename from lightllm/common/quantization/types/w8a8.py
rename to lightllm/common/quantization/w8a8.py
index e3b0ef592..f803794a2 100644
--- a/lightllm/common/quantization/types/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -3,7 +3,7 @@
 
 from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.quantization.registry import QUANTMETHODS
-from lightllm.common.quantization.backend import QUANT_BACKEND, BackendType
+from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -24,7 +24,7 @@
 
 
 @QUANTMETHODS.register(["w8a8", "vllm-w8a8"])
-class W8A8Quantization(QuantizationMethod):
+class W8A8Quantization(QuantizationMethod, PlatformAwareOp):
     def __init__(self):
         super().__init__()
         from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
@@ -33,26 +33,18 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-        self._backend = QUANT_BACKEND.get_backend("w8a8")
-
-        if self._backend == BackendType.TRITON:
-            if not HAS_VLLM:
-                raise NotImplementedError(
-                    "W8A8 Triton fallback is not yet implemented. "
-                    "Please install vLLM or disable LIGHTLLM_USE_TRITON_QUANT."
-                )
-            self._backend = BackendType.VLLM
-            logger.warning("W8A8 Triton fallback not implemented, falling back to vLLM backend")
-
-        if self._backend == BackendType.VLLM and not HAS_VLLM:
-            raise RuntimeError("vLLM is required for W8A8 quantization but is not installed.")
-
-        logger.info(f"W8A8Quantization using backend: {self._backend.name}")
-
     @property
     def method_name(self):
         return "w8a8"
 
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
     def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
         weight = weight.float().cuda(self.device_id_)
         scale = weight.abs().max(dim=-1)[0] / 127
@@ -71,10 +63,9 @@ def apply(
         use_custom_tensor_mananger: bool = True,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        # TODO: Currently only vLLM backend is implemented
-        return self._apply_vllm(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
+        return self._forward(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
 
-    def _apply_vllm(
+    def _cuda_forward(
         self,
         input_tensor: torch.Tensor,
         weight_pack: WeightPack,
@@ -98,11 +89,3 @@ def _apply_vllm(
 
         cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
         return out
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
-        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
index 05897203a..d927f22d1 100644
--- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
@@ -11,7 +11,6 @@
     FusedMoeWeightEP,
     create_tp_moe_wegiht_obj,
 )
-from functools import partial
 from ..triton_kernel.weight_dequant import weight_dequant
 
 
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index 05da4ccb3..e4b5f8f2b 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -612,8 +612,16 @@ def make_argument_parser() -> argparse.ArgumentParser:
         help="""Hardware platform: cuda | musa""",
     )
     parser.add_argument(
-        "--enable_torch_naive",
+        "--enable_torch_fallback",
         action="store_true",
-        help="""Use torch naive implementation for the op.""",
+        help="""Whether to enable torch naive implementation for the op.
+        If the op is not implemented for the platform, it will use torch naive implementation.""",
+    )
+    parser.add_argument(
+        "--enable_triton_fallback",
+        action="store_true",
+        help="""Whether to enable triton implementation for the op.
+        If the op is not implemented for the platform and the hardware support triton,
+        it will use triton implementation.""",
     )
     return parser

From a63cc8cbf1240dbd85fbbcc8544aad94c7b234ec Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 15 Jan 2026 11:40:09 +0000
Subject: [PATCH 18/65] fix

---
 ...ernel.py => scaled_mm_per_token_kernel.py} |  33 ++--
 lightllm/common/quantization/fp8_per_token.py |   2 +-
 lightllm/common/quantization/w8a8.py          | 170 +++++++++++++++++-
 ...h.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json |   0
 ...h.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json |   0
 ...h.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json |   0
 ...h.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json |   0
 ...h.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |   0
 20 files changed, 191 insertions(+), 14 deletions(-)
 rename lightllm/common/basemodel/triton_kernel/quantization/{fp8w8a8_scaled_mm_per_token_kernel.py => scaled_mm_per_token_kernel.py} (93%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=13824,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=1536,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=1536,N=8960,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=13824,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=5120,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{K=8960,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)
 rename lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/{fp8_scaled_mm_per_token:v3 => scaled_mm_per_token:v1}/{N=14336,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json (100%)

diff --git a/lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_scaled_mm_per_token_kernel.py b/lightllm/common/basemodel/triton_kernel/quantization/scaled_mm_per_token_kernel.py
similarity index 93%
rename from lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_scaled_mm_per_token_kernel.py
rename to lightllm/common/basemodel/triton_kernel/quantization/scaled_mm_per_token_kernel.py
index 7c76e82c9..f14e8b283 100644
--- a/lightllm/common/basemodel/triton_kernel/quantization/fp8w8a8_scaled_mm_per_token_kernel.py
+++ b/lightllm/common/basemodel/triton_kernel/quantization/scaled_mm_per_token_kernel.py
@@ -11,8 +11,8 @@
 from lightllm.utils.device_utils import triton_support_tensor_descriptor, is_5090_gpu
 
 
-class Fp8ScaledMMKernelConfig(KernelConfigs):
-    kernel_name: str = "fp8_scaled_mm_per_token"
+class ScaledMMKernelConfig(KernelConfigs):
+    kernel_name: str = "scaled_mm_per_token"
 
     @classmethod
     @lru_cache(maxsize=200)
@@ -105,6 +105,7 @@ def _scaled_mm_per_token(
     BLOCK_N: tl.constexpr,
     BLOCK_K: tl.constexpr,
     GROUP_M: tl.constexpr,
+    ACC_DTYPE: tl.constexpr,
 ):
     pid = tl.program_id(0)
     m_block_num = tl.cdiv(M, BLOCK_M)
@@ -134,7 +135,7 @@ def _scaled_mm_per_token(
     a_s = tl.load(Ascale_ptrs)
     b_s = tl.load(Bscale_ptrs)
 
-    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_DTYPE)
 
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         if USE_TMA:
@@ -155,6 +156,7 @@ def _scaled_mm_per_token(
             a_ptrs += BLOCK_K * stride_ak
             b_ptrs += BLOCK_K * stride_bk
 
+    acc = acc.to(tl.float32)
     acc = acc * a_s[:, None] * b_s[None, :]
 
     acc = acc.to(out.dtype.element_ty)
@@ -206,13 +208,13 @@ def _get_static_key(A, B, out_dtype):
 
 
 @autotune(
-    kernel_name="fp8_scaled_mm_per_token:v3",
+    kernel_name="scaled_mm_per_token:v1",
     configs_gen_func=get_test_configs,
     static_key_func=_get_static_key,
     run_key_func=lambda A: A.shape[0],
     mutates_args=["out"],
 )
-def fp8_scaled_mm_per_token(
+def scaled_mm_per_token(
     A: torch.Tensor,
     B: torch.Tensor,
     Ascale: torch.Tensor,
@@ -221,7 +223,7 @@ def fp8_scaled_mm_per_token(
     out: torch.Tensor,
     run_config=None,
 ) -> torch.Tensor:
-    """w8a8fp8 per-token quantization mm.
+    """w8a8 per-token quantization mm (supports fp8 and int8).
 
     Args:
         A: Matrix A with shape of [M, K].
@@ -239,7 +241,7 @@ def fp8_scaled_mm_per_token(
     M, K = A.shape
     _, N = B.shape
     if not run_config:
-        run_config = Fp8ScaledMMKernelConfig.try_to_get_best_config(M=M, N=N, K=K, out_dtype=out_dtype)
+        run_config = ScaledMMKernelConfig.try_to_get_best_config(M=M, N=N, K=K, out_dtype=out_dtype)
     NEED_N_MASK = N % run_config["BLOCK_N"] != 0
     NEED_K_MASK = K % run_config["BLOCK_K"] != 0
     grid = (triton.cdiv(M, run_config["BLOCK_M"]) * triton.cdiv(N, run_config["BLOCK_N"]),)
@@ -283,6 +285,8 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         B_desc = None
         out_desc = None
 
+    ACC_DTYPE = tl.int32 if A.dtype == torch.int8 else tl.float32
+
     _scaled_mm_per_token[grid](
         A=A,
         A_desc=A_desc,
@@ -305,12 +309,17 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         B_IS_TRANS=B_is_trans,
         NEED_N_MASK=NEED_N_MASK,
         NEED_K_MASK=NEED_K_MASK,
+        ACC_DTYPE=ACC_DTYPE,
         **run_config,
     )
 
     return out
 
 
+fp8_scaled_mm_per_token = scaled_mm_per_token
+int8_scaled_mm_per_token = scaled_mm_per_token
+
+
 if __name__ == "__main__":
     import time
     import os
@@ -324,7 +333,7 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
     M_list = [1, 2, 4, 8, 16, 32, 48]
 
     print(f"{'='*80}")
-    print(f"Starting Autotune for FP8 Scaled MM (N={N}, K={K})")
+    print(f"Starting Autotune for Scaled MM (N={N}, K={K})")
     print(f"M values to test: {M_list}")
     print(f"Total configs per M: {len(get_test_configs())}")
     print(f"{'='*80}\n")
@@ -360,7 +369,7 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
     gt_C = d_A.mm(d_B)
 
     # 运行kernel验证正确性
-    fp8_scaled_mm_per_token(A_verify, B, Ascale_verify, Bscale, output_dtype, out_verify)
+    scaled_mm_per_token(A_verify, B, Ascale_verify, Bscale, output_dtype, out_verify)
 
     # 计算cosine similarity
     cosine_sim = F.cosine_similarity(out_verify.flatten().unsqueeze(0), gt_C.flatten().unsqueeze(0), dim=1)
@@ -390,7 +399,7 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         A = test_data[M]["A"]
         Ascale = test_data[M]["Ascale"]
         out = test_data[M]["out"]
-        fp8_scaled_mm_per_token(A, B, Ascale, Bscale, output_dtype, out)
+        scaled_mm_per_token(A, B, Ascale, Bscale, output_dtype, out)
         print(f"[M={M}] Autotune completed!")
 
     Autotuner.end_autotune_warmup()
@@ -418,7 +427,7 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         gt_C = d_A.mm(d_B)
 
         # 运行一次确保结果正确
-        fp8_scaled_mm_per_token(A, B, Ascale, Bscale, output_dtype, out)
+        scaled_mm_per_token(A, B, Ascale, Bscale, output_dtype, out)
         sgl_res = fp8_scaled_mm(A, B, Ascale, Bscale, output_dtype)
 
         cosine_sim = F.cosine_similarity(out.flatten().unsqueeze(0), gt_C.flatten().unsqueeze(0), dim=1)
@@ -437,7 +446,7 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         ms_sgl = triton.testing.do_bench(fn_sgl, warmup=25, rep=100)
 
         # Our kernel
-        fn_ours = lambda: fp8_scaled_mm_per_token(A, B, Ascale, Bscale, output_dtype, out)
+        fn_ours = lambda: scaled_mm_per_token(A, B, Ascale, Bscale, output_dtype, out)
         ms_ours = triton.testing.do_bench_cudagraph(fn_ours, rep=100)
 
         print(f"[M={M}] BF16:       {ms_bf16:.3f} ms")
diff --git a/lightllm/common/quantization/fp8_per_token.py b/lightllm/common/quantization/fp8_per_token.py
index c49bc89ff..ce7f9342c 100644
--- a/lightllm/common/quantization/fp8_per_token.py
+++ b/lightllm/common/quantization/fp8_per_token.py
@@ -4,7 +4,7 @@
 from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.quantization.registry import QUANTMETHODS
 from lightllm.common.quantization.backend import QUANT_BACKEND, BackendType
-from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
+from lightllm.common.basemodel.triton_kernel.quantization.scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
diff --git a/lightllm/common/quantization/w8a8.py b/lightllm/common/quantization/w8a8.py
index f803794a2..721807356 100644
--- a/lightllm/common/quantization/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -4,6 +4,10 @@
 from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.quantization.registry import QUANTMETHODS
 from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
+from lightllm.common.basemodel.triton_kernel.quantization.scaled_mm_per_token_kernel import (
+    fp8_scaled_mm_per_token,
+    int8_scaled_mm_per_token,
+)
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -23,6 +27,27 @@
     cutlass_scaled_mm = None
 
 
+try:
+    from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
+
+    if HAS_LIGHTLLM_KERNEL:
+
+        def scaled_fp8_quant(tensor, *args, **kwargs):
+            return light_ops.per_token_quant_bf16_fp8(tensor)
+
+    else:
+        if HAS_VLLM:
+            scaled_fp8_quant = vllm_ops.scaled_fp8_quant
+        else:
+            scaled_fp8_quant = None
+except ImportError:
+    HAS_LIGHTLLM_KERNEL = False
+    if HAS_VLLM:
+        scaled_fp8_quant = vllm_ops.scaled_fp8_quant
+    else:
+        scaled_fp8_quant = None
+
+
 @QUANTMETHODS.register(["w8a8", "vllm-w8a8"])
 class W8A8Quantization(QuantizationMethod, PlatformAwareOp):
     def __init__(self):
@@ -63,13 +88,53 @@ def apply(
         use_custom_tensor_mananger: bool = True,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        return self._forward(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
+        return self._forward(
+            input_tensor=input_tensor,
+            weight_pack=weight_pack,
+            out=out,
+            workspace=workspace,
+            use_custom_tensor_mananger=use_custom_tensor_mananger,
+            bias=bias,
+        )
+
+    def _triton_forward(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        workspace: Optional[torch.Tensor],
+        use_custom_tensor_mananger: bool,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+
+        qweight = weight_pack.weight.t()
+        weight_scale = weight_pack.weight_scale
+
+        # TODO: support fp8 quantization triton
+
+        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+        m = input_tensor.shape[0]
+        n = qweight.shape[1]
+
+        if out is None:
+            if use_custom_tensor_mananger:
+                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
+            else:
+                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        out = int8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
+
+        if bias is not None:
+            out.add_(bias)
+        return out
 
     def _cuda_forward(
         self,
         input_tensor: torch.Tensor,
         weight_pack: WeightPack,
         out: Optional[torch.Tensor],
+        workspace: Optional[torch.Tensor],
         use_custom_tensor_mananger: bool,
         bias: Optional[torch.Tensor],
     ) -> torch.Tensor:
@@ -89,3 +154,106 @@ def _cuda_forward(
 
         cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
         return out
+
+
+class Fp8W8A8Quantization(QuantizationMethod, PlatformAwareOp):
+    def __init__(self):
+        super().__init__()
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+
+        self.cache_manager = g_cache_manager
+        self.is_moe = False
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False
+
+    @property
+    def method_name(self):
+        return "f8w8a8"
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        """Quantize weights using per-token FP8 quantization."""
+        qweight, weight_scale = scaled_fp8_quant(
+            weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
+        )
+        output.weight[offset : offset + qweight.shape[0], :].copy_(qweight)
+        output.weight_scale[offset : offset + weight_scale.shape[0]].copy_(weight_scale.view(-1))
+        return
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        if self.is_moe:
+            assert num_experts > 1, "Number of experts must be greater than 1 for MOE"
+            # per-tensor weight quantization for moe
+            weight_scale = torch.empty((num_experts,), dtype=torch.float32).cuda(device_id)
+        else:
+            weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self._forward(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
+
+    def _cuda_forward(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        use_custom_tensor_mananger: bool,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        qweight = weight_pack.weight.t()
+        weight_scale = weight_pack.weight_scale
+
+        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+        m = input_tensor.shape[0]
+        n = qweight.shape[1]
+
+        if out is None:
+            if use_custom_tensor_mananger:
+                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
+            else:
+                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
+        return out
+
+    def _apply_triton(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor],
+        use_custom_tensor_mananger: bool,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        qweight = weight_pack.weight.t()
+        weight_scale = weight_pack.weight_scale
+
+        # TODO: support fp8 quantization triton
+
+        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+        m = input_tensor.shape[0]
+        n = qweight.shape[1]
+
+        if out is None:
+            if use_custom_tensor_mananger:
+                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
+            else:
+                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        out = fp8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
+
+        if bias is not None:
+            out.add_(bias)
+        return out
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/fp8_scaled_mm_per_token:v3/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_4090_D/scaled_mm_per_token:v1/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_4090_D.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=13824,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=13824,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=13824,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=13824,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=14336,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=1536,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=1536,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=1536,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=1536,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=1536,N=8960,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=1536,N=8960,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=1536,N=8960,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=1536,N=8960,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=4096,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=13824,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=13824,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=13824,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=13824,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=28672,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=4096,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=5120,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=5120,N=5120,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=8960,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=8960,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{K=8960,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=8960,N=1536,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{N=14336,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{N=14336,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
similarity index 100%
rename from lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/fp8_scaled_mm_per_token:v3/{N=14336,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
rename to lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{N=14336,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json

From 164a299f1a816594c472bdf47781e853e3dbffb1 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 15 Jan 2026 11:40:53 +0000
Subject: [PATCH 19/65] unit_test

---
 unit_tests/common/quantization/test_fp8_scaled_mm_per_token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unit_tests/common/quantization/test_fp8_scaled_mm_per_token.py b/unit_tests/common/quantization/test_fp8_scaled_mm_per_token.py
index 2c0b7bf76..e6a0d52c7 100644
--- a/unit_tests/common/quantization/test_fp8_scaled_mm_per_token.py
+++ b/unit_tests/common/quantization/test_fp8_scaled_mm_per_token.py
@@ -1,7 +1,7 @@
 import torch
 import pytest
 import torch.nn.functional as F
-from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
+from lightllm.common.basemodel.triton_kernel.quantization.scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
 
 
 def is_fp8_native_supported():

From e301d4719c0a1e248b890de6d79047d88ea6479d Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 19 Jan 2026 13:47:40 +0000
Subject: [PATCH 20/65] fix

---
 .../common/basemodel/attention/fa3/fp8.py     |   2 +-
 .../layer_weights/base_layer_weight.py        |   9 -
 .../meta_weights/att_sink_weight.py           |   3 -
 .../meta_weights/embedding_weight.py          |  40 ++-
 .../fused_moe/fused_moe_weight_ep.py          |   5 +-
 .../fused_moe/fused_moe_weight_tp.py          |   1 +
 .../meta_weights/mm_weight/mm_weight.py       |   3 -
 .../layer_weights/meta_weights/norm_weight.py |  29 +-
 .../basemodel/triton_kernel/norm/__init__.py  |   0
 .../common/fused_moe/grouped_fused_moe.py     |   2 +-
 .../common/fused_moe/grouped_fused_moe_ep.py  |   2 +-
 lightllm/common/quantization/__init__.py      |  95 +++++-
 lightllm/common/quantization/awq.py           | 267 +++++++--------
 lightllm/common/quantization/deepgemm.py      | 133 ++++++++
 lightllm/common/quantization/fp8_block128.py  | 216 ------------
 lightllm/common/quantization/fp8_per_token.py | 172 ----------
 lightllm/common/quantization/no_quant.py      |   3 +-
 lightllm/common/quantization/registry.py      |  16 +-
 lightllm/common/quantization/w8a8.py          | 309 +++++++++---------
 .../pre_and_post_layer_weight.py              |  24 +-
 .../layer_infer/transformer_layer_infer.py    |  14 +-
 .../pre_and_post_layer_weight.py              |  23 +-
 .../pre_and_post_layer_weight.py              |  23 +-
 23 files changed, 581 insertions(+), 810 deletions(-)
 create mode 100644 lightllm/common/basemodel/triton_kernel/norm/__init__.py
 create mode 100644 lightllm/common/quantization/deepgemm.py
 delete mode 100644 lightllm/common/quantization/fp8_block128.py
 delete mode 100644 lightllm/common/quantization/fp8_per_token.py

diff --git a/lightllm/common/basemodel/attention/fa3/fp8.py b/lightllm/common/basemodel/attention/fa3/fp8.py
index 3feed1ef4..12b2b0dfa 100644
--- a/lightllm/common/basemodel/attention/fa3/fp8.py
+++ b/lightllm/common/basemodel/attention/fa3/fp8.py
@@ -4,7 +4,7 @@
 from typing import Optional, TYPE_CHECKING
 from lightllm.utils.sgl_utils import flash_attn_with_kvcache
 from lightllm.utils.envs_utils import get_env_start_args
-from lightllm.common.basemodel.triton_kernel.q_per_head_fp8_quant import q_per_head_fp8_quant
+from lightllm.common.basemodel.triton_kernel.quantization.q_per_head_fp8_quant import q_per_head_fp8_quant
 from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops
 from typing import Union
 from .fp import Fa3AttBackend, Fa3PrefillAttState, Fa3DecodeAttState
diff --git a/lightllm/common/basemodel/layer_weights/base_layer_weight.py b/lightllm/common/basemodel/layer_weights/base_layer_weight.py
index 1875e2c3b..6bdeb64d2 100644
--- a/lightllm/common/basemodel/layer_weights/base_layer_weight.py
+++ b/lightllm/common/basemodel/layer_weights/base_layer_weight.py
@@ -26,14 +26,5 @@ def init_static_params(self):
         """
         pass
 
-    def verify_load(self):
-        """
-        verify all load is ok
-        """
-        for attr_name in dir(self):
-            attr = getattr(self, attr_name)
-            if isinstance(attr, BaseWeight):
-                assert attr.verify_load(), f"Loading {attr_name} of layers {self.layer_num_} fails."
-
     def _cuda(self, cpu_tensor):
         return cpu_tensor.contiguous().to(self.data_type_).cuda(get_current_device_id())
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
index 3f8e1f50a..1c22bcb7d 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
@@ -18,6 +18,3 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         t_weight = weights[self.weight_name]
         start_head_index, end_head_index = self._get_head_tp_split_params(weight=t_weight)
         self.weight = t_weight[start_head_index:end_head_index].to(self.data_type_).cuda(get_current_device_id())
-
-    def verify_load(self):
-        return self.weight is not None
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index e228d5c86..df9050d4f 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -44,18 +44,15 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
     def _native_forward(
         self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
     ) -> torch.Tensor:
-        # Adjust input_ids for tp split
         adjusted_ids = input_ids - self.tp_vocab_start_id
-        # Clamp to valid range for this partition
         adjusted_ids = torch.clamp(adjusted_ids, 0, self.weight.shape[0] - 1)
-        # Use PyTorch native embedding
         result = torch.nn.functional.embedding(adjusted_ids, self.weight)
         if out is not None:
             out.copy_(result)
             return out
         return result
 
-    def _cuda_forward(
+    def _triton_forward(
         self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
         if out is None:
@@ -71,6 +68,17 @@ def _cuda_forward(
         )
         return out
 
+    def _cuda_forward(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        return self._triton_forward(input_ids=input_ids, out=out, alloc_func=alloc_func)
+
+    def _musa_forward(
+        self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        # triton implementation is supported by musa.
+        return self._triton_forward(input_ids=input_ids, out=out, alloc_func=alloc_func)
+
     def __call__(
         self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
@@ -84,7 +92,7 @@ def __init__(
         vocab_size: int,
         weight_name: str,
         data_type: torch.dtype,
-        shared_weight: Optional[EmbeddingWeight] = None,
+        embedding_weight: Optional[EmbeddingWeight] = None,
     ):
         super().__init__()
         self.dim = dim
@@ -97,23 +105,19 @@ def __init__(
         self.tp_vocab_end_id = int(split_indexes[self.tp_rank_ + 1])
         self.weight_name: str = weight_name
         self.data_type_ = data_type
-        self._shared_weight = shared_weight
-        if shared_weight is None:
-            self._create_weight()
-
-    @property
-    def weight(self) -> torch.Tensor:
-        if self._shared_weight is not None:
-            return self._shared_weight.weight
-        return self._weight
+        self._embedding_weight = embedding_weight
+        self._create_weight()
 
     def _create_weight(self):
+        if self._embedding_weight is not None:
+            self.weight = self._embedding_weight.weight
+            return
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
-        self._weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
+        self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
-        # When using shared weight, no need to load - EmbeddingWeight already loaded it
-        if self._shared_weight is not None:
+        # When set tile_embedding=True, no need to load - EmbeddingWeight already loaded it
+        if self._embedding_weight is not None:
             return
         if self.weight_name not in weights:
             return
@@ -123,7 +127,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
             loaded_vocab_size == self.vocab_size
         ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
         logger.info(f"loaded weight vocab_size: {self.vocab_size}")
-        self._weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
+        self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
 
     def _native_forward(
         self, input: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
index a84d19893..342026de2 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
@@ -15,7 +15,7 @@
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
 from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num
 from lightllm.utils.envs_utils import get_env_start_args
-from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import (
+from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import (
     per_token_group_quant_fp8,
     tma_align_input_scale,
 )
@@ -741,6 +741,3 @@ def _cuda(self, cpu_tensor):
         if self.quantized_weight:
             return cpu_tensor.contiguous().cuda(self.device_id_)
         return cpu_tensor.contiguous().to(self.data_type_).cuda(self.device_id_)
-
-    def verify_load(self):
-        return self.w1 is not None and self.w2 is not None
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
index c6b3dc965..876dc44bd 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
@@ -90,6 +90,7 @@ def __init__(
         self.num_fused_shared_experts = num_fused_shared_experts
         self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0)
         self.split_inter_size = split_inter_size
+        self.data_type_ = data_type
         self.hidden_size = network_config.get("hidden_size")
         self.e_score_correction_bias = None
         self.scoring_func = network_config.get("scoring_func", "softmax")
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index a7288b818..728ed82fa 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -120,9 +120,6 @@ def load_hf_weights(self, weights):
             for sub_child_index, param_name in enumerate(self.weight_zero_point_names):
                 self._load_weight_zero_point(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
 
-    def verify_load(self) -> bool:
-        return True
-
     def _create_weight(self):
         self.bias = None
         if self.bias_names is not None:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index df12ec9b1..d7bbe5567 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -2,9 +2,9 @@
 from typing import Optional, Dict
 from .base_weight import BaseWeightTpl
 from lightllm.utils.dist_utils import get_current_device_id, get_current_rank_in_dp, get_dp_world_size
-from lightllm.common.basemodel.triton_kernel.rmsnorm import rmsnorm_forward
-from lightllm.common.basemodel.triton_kernel.layernorm import layernorm_forward
-from lightllm.common.basemodel.triton_kernel.qk_norm import qk_rmsnorm_forward
+from lightllm.common.basemodel.triton_kernel.norm.rmsnorm import rmsnorm_forward
+from lightllm.common.basemodel.triton_kernel.norm.layernorm import layernorm_forward
+from lightllm.common.basemodel.triton_kernel.norm.qk_norm import qk_rmsnorm_forward
 from .platform_op import PlatformAwareOp
 
 
@@ -53,6 +53,12 @@ def _cuda_forward(
         # only triton implementation is supported for rmsnorm on cuda platform
         return self._triton_forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
 
+    def _musa_forward(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        # triton implementation is supported by musa.
+        return self._triton_forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
+
     def __call__(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
@@ -105,6 +111,12 @@ def _cuda_forward(
         # only triton implementation is supported for layernorm on cuda platform
         return self._triton_forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
 
+    def _musa_forward(
+        self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
+    ) -> torch.Tensor:
+        # MUSA path: delegate to the Triton implementation, forwarding every argument unchanged.
+        return self._triton_forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
+
     def __call__(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
@@ -188,15 +200,22 @@ def _native_forward(
         input.copy_(x)
         return
 
+    def _triton_forward(self, input: torch.Tensor, eps: float) -> torch.Tensor:
+        assert input.ndim == 2 and self.weight.ndim == 1  # only a 2-D input with a 1-D weight is accepted
+        return qk_rmsnorm_forward(x=input, weight=self.weight, eps=eps)
+
     def _cuda_forward(
         self,
         input: torch.Tensor,
         eps: float,
     ) -> None:
-        assert input.ndim == 2 and self.weight.ndim == 1
-        qk_rmsnorm_forward(x=input, weight=self.weight, eps=eps)
+        self._triton_forward(input=input, eps=eps)  # return value is dropped; this method returns None
         return
 
+    def _musa_forward(self, input: torch.Tensor, eps: float) -> torch.Tensor:
+        # MUSA path: reuse the same Triton forward that the CUDA path calls.
+        return self._triton_forward(input=input, eps=eps)
+
     def __call__(
         self,
         input: torch.Tensor,
diff --git a/lightllm/common/basemodel/triton_kernel/norm/__init__.py b/lightllm/common/basemodel/triton_kernel/norm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lightllm/common/fused_moe/grouped_fused_moe.py b/lightllm/common/fused_moe/grouped_fused_moe.py
index 758d83ba3..f29d3a2a0 100644
--- a/lightllm/common/fused_moe/grouped_fused_moe.py
+++ b/lightllm/common/fused_moe/grouped_fused_moe.py
@@ -28,7 +28,7 @@
 from .moe_kernel_configs import MoeGroupedGemmKernelConfig
 from .moe_silu_and_mul import silu_and_mul_fwd
 from .moe_sum_reduce import moe_sum_reduce
-from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
+from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.utils.torch_ops_utils import direct_register_custom_op
 from lightllm.common.triton_utils.autotuner import autotune
 
diff --git a/lightllm/common/fused_moe/grouped_fused_moe_ep.py b/lightllm/common/fused_moe/grouped_fused_moe_ep.py
index 5cc0d7a9b..2a577890b 100644
--- a/lightllm/common/fused_moe/grouped_fused_moe_ep.py
+++ b/lightllm/common/fused_moe/grouped_fused_moe_ep.py
@@ -8,7 +8,7 @@
 from lightllm.utils.log_utils import init_logger
 from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
 from lightllm.common.fused_moe.moe_silu_and_mul_mix_quant_ep import silu_and_mul_masked_post_quant_fwd
-from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import (
+from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import (
     per_token_group_quant_fp8,
     tma_align_input_scale,
 )
diff --git a/lightllm/common/quantization/__init__.py b/lightllm/common/quantization/__init__.py
index 8cbcc2e68..bf99622ef 100644
--- a/lightllm/common/quantization/__init__.py
+++ b/lightllm/common/quantization/__init__.py
@@ -1,13 +1,82 @@
-from .no_quant import NoQuantization
-from .fp8_block128 import FP8Block128Quantization
-from .fp8_per_token import FP8PerTokenQuantization
-from .w8a8 import W8A8Quantization
-from .awq import AWQQuantization
-
-__all__ = [
-    "NoQuantization",
-    "FP8Block128Quantization",
-    "FP8PerTokenQuantization",
-    "W8A8Quantization",
-    "AWQQuantization",
-]
+import yaml
+import collections
+from .registry import QUANTMETHODS
+from .w8a8 import *
+from .deepgemm import *
+from .awq import *
+from .no_quant import *
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+
+class Quantcfg:
+    """Resolve the quant type per layer from the model config plus an optional custom YAML file."""
+
+    def __init__(self, network_config, quant_type="none", custom_cfg_path=None):
+        """Parse the custom YAML first, then the HF ``quantization_config`` (which may replace quant_type)."""
+        self.layer_num = network_config["n_layer"]
+        self.quant_type = quant_type
+        self.network_config_ = network_config
+        self._parse_custom_cfg(custom_cfg_path)
+        self._parse_network_config(network_config)
+
+    def _parse_network_config(self, network_config):
+        """Record whether the checkpoint ships a ``quantization_config`` and map its method if so."""
+        hf_quantization_config = network_config.get("quantization_config", None)
+        self.hf_quantization_config = hf_quantization_config
+        self.quantized_weight = hf_quantization_config is not None
+        self.static_activation = False
+        if hf_quantization_config is None:
+            return
+        # Fix: read activation_scheme from quantization_config (where weight_block_size is also read);
+        # the top-level key that was consulted before is kept as a fallback.
+        top_level_scheme = network_config.get("activation_scheme", "dynamic")
+        self.static_activation = hf_quantization_config.get("activation_scheme", top_level_scheme) == "static"
+        self.hf_quantization_method = hf_quantization_config["quant_method"]
+        self._mapping_quant_method()
+
+    def _mapping_quant_method(self):
+        """Translate the HF ``quant_method`` into a lightllm quant type; unknown methods keep quant_type."""
+        if self.hf_quantization_method == "fp8":
+            block_size = self.hf_quantization_config.get("weight_block_size", None)
+            # Only the [128, 128] block layout is mapped; any other fp8 layout leaves quant_type unchanged.
+            if block_size == [128, 128]:
+                # NOTE(review): confirm the deepgemm_quant module path still exists in this tree.
+                from lightllm.common.quantization.deepgemm_quant import HAS_DEEPGEMM
+
+                self.quant_type = "deepgemm-fp8w8a8-b128" if HAS_DEEPGEMM else "vllm-fp8w8a8-b128"
+                logger.info(f"select fp8w8a8-b128 quant way: {self.quant_type}")
+        elif self.hf_quantization_method == "awq":
+            marlin_ok = is_awq_marlin_compatible(self.hf_quantization_config)
+            self.quant_type = "awq_marlin" if marlin_ok else "awq"
+            logger.info(f"select awq quant way: {self.quant_type}")
+        # TODO: more quant method
+
+    def _parse_custom_cfg(self, custom_cfg_path):
+        """Load the global quant_type and the per-layer ``mix_bits`` overrides from ``custom_cfg_path``."""
+        self.quant_cfg = collections.defaultdict(dict)
+        if custom_cfg_path is None:
+            return
+
+        with open(custom_cfg_path, "r") as file:
+            data = yaml.safe_load(file)
+
+        # quant_type is a required key in the YAML; mix_bits is optional.
+        self.quant_type = data["quant_type"]
+        for layer_quant_cfg in data.get("mix_bits", []):
+            name, layer_quant_type = layer_quant_cfg["name"], layer_quant_cfg["quant_type"]
+            # An entry without layer_nums applies to every layer.
+            for layer_num in layer_quant_cfg.get("layer_nums", range(self.layer_num)):
+                self.quant_cfg[layer_num][name] = layer_quant_type
+
+    def get_quant_type(self, layer_num, name):
+        """Return the override configured for ``(layer_num, name)``, else the global quant_type."""
+        return self.quant_cfg.get(layer_num, {}).get(name, self.quant_type)
+
+    def get_quant_method(self, layer_num, name):
+        """Fetch the registered method for this weight and attach the HF quantization config to it."""
+        quant_type = self.get_quant_type(layer_num, name)
+        quant_method = QUANTMETHODS.get(quant_type)
+        quant_method.hf_quantization_config = self.hf_quantization_config
+        return quant_method
diff --git a/lightllm/common/quantization/awq.py b/lightllm/common/quantization/awq.py
index eedc5b67b..ddb7674dd 100644
--- a/lightllm/common/quantization/awq.py
+++ b/lightllm/common/quantization/awq.py
@@ -39,40 +39,37 @@
     TYPE_MAP = {}
 
 
-def is_awq_marlin_compatible(quantization_config: dict[str, Any]) -> bool:
-    if not HAS_VLLM:
-        return False
-
-    quant_method = quantization_config.get("quant_method", "").lower()
-    num_bits = quantization_config.get("bits")
-    group_size = quantization_config.get("group_size")
-    zero_point = quantization_config.get("zero_point")
-
-    if not torch.cuda.is_available():
-        return False
+class AWQBaseQuantizationMethod(QuantizationMethod):  # common base of the AWQ methods registered below
+    def __init__(self):
+        super().__init__()
+        assert HAS_VLLM, "vllm are not installed, you can't use quant api of them."  # NOTE: skipped under -O
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
-    if quant_method != "awq":
-        return False
+        self.cache_manager = g_cache_manager
 
-    if num_bits is None or group_size is None or zero_point is None:
-        return False
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+        raise NotImplementedError("AWQ online quantization is not supported yet.")
 
-    if num_bits not in TYPE_MAP:
-        return False
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError("AWQ online quantization is not supported yet.")  # subclasses override apply
 
-    return check_marlin_supported(quant_type=TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point)
+    @property
+    def method_name(self):
+        return "awq-base"
 
 
-@QUANTMETHODS.register(["awq", "awq_marlin"])
-class AWQQuantization(QuantizationMethod):
+@QUANTMETHODS.register("awq", platform="cuda")
+class AWQW4A16QuantizationMethod(AWQBaseQuantizationMethod):
     def __init__(self):
         super().__init__()
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-
-        if not HAS_VLLM:
-            raise RuntimeError("vLLM is required for AWQ quantization but is not installed.")
-
-        self.cache_manager = g_cache_manager
         self.pack_factor = 8
         self.weight_scale_suffix = "scales"
         self.weight_zero_point_suffix = "qzeros"
@@ -80,38 +77,11 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = True
 
-        self._use_marlin = False
-        self._marlin_initialized = False
-
-    def _init_marlin(self):
-        if self._marlin_initialized:
-            return
-
-        self.nbits = 4
-        self.g_idx = marlin_make_empty_g_idx(torch.device("cuda"))
-        self.g_idx_sort_indices = marlin_make_empty_g_idx(torch.device("cuda"))
-        self.workspace = marlin_make_workspace_new(torch.device("cuda"))
-        self.vllm_quant_type = TYPE_MAP[self.nbits]
-        self.tile_size = 16
-        self._marlin_initialized = True
-
-    def _check_and_set_marlin(self):
-        if self.hf_quantization_config is None:
-            self._use_marlin = False
-            return
-
-        self._use_marlin = is_awq_marlin_compatible(self.hf_quantization_config)
-        if self._use_marlin:
-            self._init_marlin()
-            logger.info("AWQQuantization using Marlin backend")
-        else:
-            logger.info("AWQQuantization using basic AWQ backend")
-
     @property
     def method_name(self):
         return "awq"
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
     def apply(
@@ -122,22 +92,6 @@ def apply(
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
         bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if not hasattr(self, "_checked_marlin"):
-            self._check_and_set_marlin()
-            self._checked_marlin = True
-
-        if self._use_marlin:
-            return self._apply_marlin(input_tensor, weight_pack, out, bias)
-        else:
-            return self._apply_basic(input_tensor, weight_pack, out, bias)
-
-    def _apply_basic(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor],
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
@@ -154,12 +108,81 @@ def _apply_basic(
             out.add_(bias)
         return out
 
-    def _apply_marlin(
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        # weight/zero-point columns are out_dim // pack_factor; scale/zero-point rows are in_dim // group_size.
+        group_size = self.hf_quantization_config["group_size"]
+        prefix = (num_experts,) if num_experts > 1 else ()
+        packed_out, groups = out_dim // self.pack_factor, in_dim // group_size
+        weight = torch.empty(prefix + (in_dim, packed_out), dtype=torch.int32).cuda(device_id)
+        weight_scale = torch.empty(prefix + (groups, out_dim), dtype=dtype).cuda(device_id)
+        weight_zero_point = torch.empty(prefix + (groups, packed_out), dtype=torch.int32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy ``weight`` into the packed weight columns starting at ``start_idx // pack_factor``."""
+        col = start_idx // self.pack_factor
+        weight_pack.weight[:, col : col + weight.shape[1]].copy_(weight)
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy ``weight_scale`` into the scale columns starting at ``start_idx`` (no pack_factor division)."""
+        weight_pack.weight_scale[:, start_idx : start_idx + weight_scale.shape[1]].copy_(weight_scale)
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy ``weight_zero_point`` into the packed columns starting at ``start_idx // pack_factor``."""
+        begin = start_idx // self.pack_factor
+        packed_cols = weight_zero_point.shape[1]
+        weight_pack.weight_zero_point[:, begin : begin + packed_cols].copy_(weight_zero_point)
+
+
+@QUANTMETHODS.register("awq_marlin", platform="cuda")
+class AWQMARLINW4A16QuantizationMethod(AWQBaseQuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        self.pack_factor = 8  # presumably 32 // nbits values per int32 word — TODO confirm
+        self.nbits = 4
+        self.weight_scale_suffix = "scales"
+        self.weight_zero_point_suffix = "qzeros"
+        self.weight_suffix = "qweight"
+        self.g_idx = marlin_make_empty_g_idx(torch.device("cuda"))
+        self.g_idx_sort_indices = marlin_make_empty_g_idx(torch.device("cuda"))
+        self.workspace = marlin_make_workspace_new(torch.device("cuda"))  # built once per instance, at init
+        self.vllm_quant_type = TYPE_MAP[self.nbits]
+        self.has_weight_scale = True
+        self.has_weight_zero_point = True
+        self.tile_size = 16  # used by load_weight to scale the packed column offset
+
+    @property
+    def method_name(self):
+        return "awq_marlin"  # same key this class is registered under in QUANTMETHODS
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):  # same signature as the base
+        raise NotImplementedError("AWQ online quantization is not supported yet.")
+
+    def params_repack(
+        self, weight: torch.Tensor, weight_scale: torch.Tensor, weight_zero_point: torch.Tensor, dtype_type: torch.dtype
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Repack parameters after quantization so the kernels reach peak performance (needed by e.g. awq).
+        NOTE(review): the _process_*_after_loading helpers called below are not visible in this diff — confirm.
+        """
+        weight = self._process_weight_after_loading(weight.cuda(get_current_device_id()))
+        weight_scale = self._process_weight_scale_after_loading(
+            weight_scale.cuda(get_current_device_id()).to(dtype_type)
+        )
+        weight_zero_point = self._process_weight_zero_point_after_loading(
+            weight_zero_point.cuda(get_current_device_id())
+        )
+        return weight, weight_scale, weight_zero_point
+
+    def apply(
         self,
         input_tensor: torch.Tensor,
         weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor],
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         qweight = weight_pack.weight
         weight_scale = weight_pack.weight_scale
@@ -200,30 +223,6 @@ def _apply_marlin(
 
     def create_weight(
         self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        if not hasattr(self, "_checked_marlin"):
-            self._check_and_set_marlin()
-            self._checked_marlin = True
-
-        if self._use_marlin:
-            return self._create_weight_marlin(out_dim, in_dim, dtype, device_id, num_experts)
-        else:
-            return self._create_weight_basic(out_dim, in_dim, dtype, device_id, num_experts)
-
-    def _create_weight_basic(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        group_size = self.hf_quantization_config["group_size"]
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (in_dim, out_dim // self.pack_factor), dtype=torch.int32).cuda(device_id)
-        weight_scale = torch.empty(expert_prefix + (in_dim // group_size, out_dim), dtype=dtype).cuda(device_id)
-        weight_zero_point = torch.empty(
-            expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
-        ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
-
-    def _create_weight_marlin(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
     ) -> WeightPack:
         self.n = out_dim
         self.k = in_dim
@@ -239,20 +238,6 @@ def _create_weight_marlin(
         return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
 
     def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        if not hasattr(self, "_checked_marlin"):
-            self._check_and_set_marlin()
-            self._checked_marlin = True
-
-        if self._use_marlin:
-            self._load_weight_marlin(weight, weight_pack, start_idx)
-        else:
-            self._load_weight_basic(weight, weight_pack, start_idx)
-
-    def _load_weight_basic(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        start_idx = start_idx // self.pack_factor
-        weight_pack.weight[:, start_idx : start_idx + weight.shape[1]].copy_(weight)
-
-    def _load_weight_marlin(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
         assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
         device_id = get_current_device_id()
         repack_weight = vllm_ops.awq_marlin_repack(
@@ -263,21 +248,9 @@ def _load_weight_marlin(self, weight: torch.Tensor, weight_pack: WeightPack, sta
         )
         start_idx = start_idx // self.pack_factor * self.tile_size
         weight_pack.weight[:, start_idx : start_idx + repack_weight.shape[1]].copy_(repack_weight)
+        return
 
     def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        if not hasattr(self, "_checked_marlin"):
-            self._check_and_set_marlin()
-            self._checked_marlin = True
-
-        if self._use_marlin:
-            self._load_weight_scale_marlin(weight_scale, weight_pack, start_idx)
-        else:
-            self._load_weight_scale_basic(weight_scale, weight_pack, start_idx)
-
-    def _load_weight_scale_basic(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[:, start_idx : start_idx + weight_scale.shape[1]].copy_(weight_scale)
-
-    def _load_weight_scale_marlin(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
         assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
         group_size = self.hf_quantization_config["group_size"]
         device_id = get_current_device_id()
@@ -288,27 +261,9 @@ def _load_weight_scale_marlin(self, weight_scale: torch.Tensor, weight_pack: Wei
             group_size=self.hf_quantization_config["group_size"],
         )
         weight_pack.weight_scale[:, start_idx : start_idx + repack_weight_scale.shape[1]].copy_(repack_weight_scale)
+        return
 
     def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        if not hasattr(self, "_checked_marlin"):
-            self._check_and_set_marlin()
-            self._checked_marlin = True
-
-        if self._use_marlin:
-            self._load_weight_zero_point_marlin(weight_zero_point, weight_pack, start_idx)
-        else:
-            self._load_weight_zero_point_basic(weight_zero_point, weight_pack, start_idx)
-
-    def _load_weight_zero_point_basic(
-        self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int
-    ) -> None:
-        start_idx = start_idx // self.pack_factor
-        end_idx = start_idx + weight_zero_point.shape[1]
-        weight_pack.weight_zero_point[:, start_idx:end_idx].copy_(weight_zero_point)
-
-    def _load_weight_zero_point_marlin(
-        self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int
-    ) -> None:
         device_id = get_current_device_id()
         repack_weight_zero_point = awq_to_marlin_zero_points(
             weight_zero_point.cuda(device_id),
@@ -320,3 +275,29 @@ def _load_weight_zero_point_marlin(
         weight_pack.weight_zero_point[:, start_idx : start_idx + repack_weight_zero_point.shape[1]].copy_(
             repack_weight_zero_point
         )
+        return
+
+
+# adapted from
+# https://github.com/vllm-project/vllm/blob/aef368aa08572505b820db01da82e2fbb3d43a72/vllm/model_executor/layers/quantization/awq_marlin.py#L211-L212
+def is_awq_marlin_compatible(quantization_config: dict[str, Any]) -> bool:
+    """Return whether this quantization config is AWQ, complete, and accepted by ``check_marlin_supported``."""
+    quant_method = quantization_config.get("quant_method", "").lower()
+    num_bits = quantization_config.get("bits")
+    group_size = quantization_config.get("group_size")
+    zero_point = quantization_config.get("zero_point")
+
+    if not torch.cuda.is_available():
+        return False
+
+    if quant_method != "awq":
+        return False
+
+    # If we cannot find the info needed in the config, cannot convert.
+    if num_bits is None or group_size is None or zero_point is None:
+        return False
+
+    if num_bits not in TYPE_MAP:
+        return False
+
+    return check_marlin_supported(quant_type=TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point)
diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
new file mode 100644
index 000000000..c9f227120
--- /dev/null
+++ b/lightllm/common/quantization/deepgemm.py
@@ -0,0 +1,133 @@
+import torch
+from typing import Optional
+
+from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
+from lightllm.common.quantization.registry import QUANTMETHODS
+from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import per_token_group_quant_fp8
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import deep_gemm
+
+    HAS_DEEPGEMM = True
+except ImportError:
+    HAS_DEEPGEMM = False
+
+
+class DeepGEMMBaseQuantizationMethod(QuantizationMethod):  # shared base for DeepGEMM-backed quant methods
+    def __init__(self):
+        super().__init__()
+        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+        # The import above is function-local; NOTE(review): presumably to avoid an import cycle — confirm.
+        self.cache_manager = g_cache_manager
+        assert HAS_DEEPGEMM, "deepgemm is not installed, you can't use quant api of it"  # NOTE: skipped under -O
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+        raise NotImplementedError("Not implemented")  # the concrete subclass provides quantize
+
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Not implemented")  # the concrete subclass provides apply
+
+    @property
+    def method_name(self):
+        return "deepgemm-base"
+
+
+@QUANTMETHODS.register(["deepgemm-fp8w8a8-b128"], platform="cuda")
+class DeepGEMMFP8w8a8B128QuantizationMethod(DeepGEMMBaseQuantizationMethod):
+    """FP8 (e4m3) weights with one float32 scale per block_size x block_size tile, multiplied via DeepGEMM."""
+
+    def __init__(self):
+        super().__init__()
+        self.block_size = 128
+        self.weight_suffix = None
+        self.weight_zero_point_suffix = None
+        self.weight_scale_suffix = "weight_scale_inv"
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False
+
+    @property
+    def method_name(self):
+        return "deepgemm-fp8w8a8-b128"
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+        """Quantize ``weight`` and store it plus its block scales in ``output`` from row ``offset`` on."""
+        from lightllm.common.basemodel.triton_kernel.quantization.fp8w8a8_block_quant_kernel import weight_quant
+
+        device = output.weight.device
+        weight, scale = weight_quant(weight.cuda(device), self.block_size)
+        end = offset + weight.shape[0]
+        output.weight[offset:end, :].copy_(weight)
+        # Fix: divide the whole end row by block_size; previously only weight.shape[0] was divided.
+        output.weight_scale[offset // self.block_size : end // self.block_size].copy_(scale)
+
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: "WeightPack",
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Quantize the input per token group and run the FP8 NT GEMM; ``bias``/``workspace`` are unused."""
+        qweight = weight_pack.weight
+        weight_scale = weight_pack.weight_scale
+        alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
+        m, _ = input_tensor.shape  # the unpacking also enforces a 2-D input
+        n = qweight.shape[0]
+        qinput_tensor, input_scale = per_token_group_quant_fp8(
+            input_tensor,
+            self.block_size,
+            dtype=qweight.dtype,
+            column_major_scales=True,
+            scale_tma_aligned=True,
+            alloc_func=alloc_func,
+        )
+        if out is None:
+            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+        _deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight, weight_scale), out)
+        return out
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        """Allocate the FP8 weight and its float32 per-block scale tensor on ``device_id``."""
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        scale_shape = expert_prefix + (out_dim // self.block_size, in_dim // self.block_size)
+        weight_scale = torch.empty(scale_shape, dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy ``weight`` into rows ``[start_idx, start_idx + weight.shape[0])`` of the packed weight."""
+        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
+
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy ``weight_scale`` into the scale rows starting at block ``start_idx // block_size``."""
+        begin = start_idx // self.block_size  # fix: the slice now spans exactly weight_scale.shape[0] rows
+        weight_pack.weight_scale[begin : begin + weight_scale.shape[0]].copy_(weight_scale)
+
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+        """Copy zero points block-wise. NOTE(review): create_weight allocates no zero-point tensor here."""
+        begin = start_idx // self.block_size  # fix: the slice now spans exactly weight_zero_point.shape[0] rows
+        weight_pack.weight_zero_point[begin : begin + weight_zero_point.shape[0]].copy_(weight_zero_point)
+
+
+def _deepgemm_fp8_nt(a_tuple, b_tuple, out):
+    # Try both known entry-point names; the installed deep_gemm may expose either one.
+    if HAS_DEEPGEMM and hasattr(deep_gemm, "gemm_fp8_fp8_bf16_nt"):
+        return deep_gemm.gemm_fp8_fp8_bf16_nt([a_tuple[0], a_tuple[1]], [b_tuple[0], b_tuple[1]], out)
+    if HAS_DEEPGEMM and hasattr(deep_gemm, "fp8_gemm_nt"):
+        return deep_gemm.fp8_gemm_nt((a_tuple[0], a_tuple[1]), (b_tuple[0], b_tuple[1]), out)
+    raise RuntimeError("deep_gemm does not provide fp8 NT GEMM kernel in this version")
diff --git a/lightllm/common/quantization/fp8_block128.py b/lightllm/common/quantization/fp8_block128.py
deleted file mode 100644
index 4144dddde..000000000
--- a/lightllm/common/quantization/fp8_block128.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import torch
-from typing import Optional
-
-from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
-from lightllm.common.quantization.registry import QUANTMETHODS
-from lightllm.common.quantization.backend import QUANT_BACKEND, BackendType
-from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
-from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
-from lightllm.utils.log_utils import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    import deep_gemm
-
-    HAS_DEEPGEMM = True
-except ImportError:
-    HAS_DEEPGEMM = False
-
-try:
-    from lightllm.utils.vllm_utils import HAS_VLLM
-
-    if HAS_VLLM:
-        from lightllm.utils.vllm_utils import cutlass_scaled_mm
-    else:
-        cutlass_scaled_mm = None
-except ImportError:
-    HAS_VLLM = False
-    cutlass_scaled_mm = None
-
-
-def _deepgemm_fp8_nt(a_tuple, b_tuple, out):
-    if hasattr(deep_gemm, "gemm_fp8_fp8_bf16_nt"):
-        return deep_gemm.gemm_fp8_fp8_bf16_nt([a_tuple[0], a_tuple[1]], [b_tuple[0], b_tuple[1]], out)
-    if hasattr(deep_gemm, "fp8_gemm_nt"):
-        return deep_gemm.fp8_gemm_nt((a_tuple[0], a_tuple[1]), (b_tuple[0], b_tuple[1]), out)
-    raise RuntimeError("deep_gemm does not provide fp8 NT GEMM kernel in this version")
-
-
-@QUANTMETHODS.register(["fp8-block128"])
-class FP8Block128Quantization(QuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-
-        self.cache_manager = g_cache_manager
-        self.block_size = 128
-        self.weight_scale_suffix = "weight_scale_inv"
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
-
-        self._backend = QUANT_BACKEND.get_backend("fp8-block128")
-        logger.info(f"FP8Block128Quantization using backend: {self._backend.name}")
-
-    @property
-    def method_name(self):
-        return "fp8-block128"
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
-
-        device = output.weight.device
-        weight, scale = weight_quant(weight.cuda(device), self.block_size)
-        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
-        output.weight_scale[offset // self.block_size : offset + weight.shape[0] // self.block_size].copy_(scale)
-        return
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
-        m, k = input_tensor.shape
-
-        if self._backend == BackendType.DEEPGEMM:
-            return self._apply_deepgemm(input_tensor, weight_pack, out, alloc_func, bias)
-        elif self._backend == BackendType.VLLM:
-            return self._apply_vllm(input_tensor, weight_pack, out, alloc_func, bias)
-        else:
-            return self._apply_triton(input_tensor, weight_pack, out, alloc_func, bias)
-
-    def _apply_deepgemm(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        alloc_func,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight
-        weight_scale = weight_pack.weight_scale
-        m, k = input_tensor.shape
-        n = qweight.shape[0]
-
-        qinput_tensor, input_scale = per_token_group_quant_fp8(
-            input_tensor,
-            self.block_size,
-            dtype=qweight.dtype,
-            column_major_scales=True,
-            scale_tma_aligned=True,
-            alloc_func=alloc_func,
-        )
-
-        if out is None:
-            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        _deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight, weight_scale), out)
-
-        if bias is not None:
-            out.add_(bias)
-        return out
-
-    def _apply_vllm(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        alloc_func,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale.t()
-        m, k = input_tensor.shape
-        n = qweight.shape[1]
-
-        qinput_tensor, input_scale = per_token_group_quant_fp8(
-            input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
-        )
-
-        if out is None:
-            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        if n % 128 != 0:
-            w8a8_block_fp8_matmul(
-                qinput_tensor,
-                qweight,
-                input_scale,
-                weight_scale,
-                out,
-                (self.block_size, self.block_size),
-                dtype=input_tensor.dtype,
-            )
-        else:
-            input_scale = input_scale.t().contiguous().t()
-            cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
-            return out
-
-        if bias is not None:
-            out.add_(bias)
-        return out
-
-    def _apply_triton(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        alloc_func,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight
-        weight_scale = weight_pack.weight_scale
-        m, k = input_tensor.shape
-        n = qweight.shape[1]
-
-        qinput_tensor, input_scale = per_token_group_quant_fp8(
-            input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
-        )
-
-        if out is None:
-            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        w8a8_block_fp8_matmul(
-            qinput_tensor,
-            qweight,
-            input_scale,
-            weight_scale,
-            out,
-            (self.block_size, self.block_size),
-            dtype=input_tensor.dtype,
-        )
-
-        if bias is not None:
-            out.add_(bias)
-        return out
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        weight_scale = torch.empty(
-            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
-        ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
-
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
-        return
-
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[
-            start_idx // self.block_size : start_idx + weight_scale.shape[0] // self.block_size
-        ].copy_(weight_scale)
-        return
-
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        if weight_pack.weight_zero_point is not None:
-            weight_pack.weight_zero_point[
-                start_idx // self.block_size : start_idx + weight_zero_point.shape[0] // self.block_size
-            ].copy_(weight_zero_point)
-        return
diff --git a/lightllm/common/quantization/fp8_per_token.py b/lightllm/common/quantization/fp8_per_token.py
deleted file mode 100644
index ce7f9342c..000000000
--- a/lightllm/common/quantization/fp8_per_token.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import torch
-from typing import Optional
-
-from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
-from lightllm.common.quantization.registry import QUANTMETHODS
-from lightllm.common.quantization.backend import QUANT_BACKEND, BackendType
-from lightllm.common.basemodel.triton_kernel.quantization.scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
-from lightllm.utils.log_utils import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    from lightllm.utils.vllm_utils import HAS_VLLM
-
-    if HAS_VLLM:
-        from lightllm.utils.vllm_utils import vllm_ops, cutlass_scaled_mm
-    else:
-        vllm_ops = None
-        cutlass_scaled_mm = None
-except ImportError:
-    HAS_VLLM = False
-    vllm_ops = None
-    cutlass_scaled_mm = None
-
-try:
-    from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
-
-    if HAS_LIGHTLLM_KERNEL:
-
-        def scaled_fp8_quant(tensor, *args, **kwargs):
-            return light_ops.per_token_quant_bf16_fp8(tensor)
-
-    else:
-        if HAS_VLLM:
-            scaled_fp8_quant = vllm_ops.scaled_fp8_quant
-        else:
-            scaled_fp8_quant = None
-except ImportError:
-    HAS_LIGHTLLM_KERNEL = False
-    if HAS_VLLM:
-        scaled_fp8_quant = vllm_ops.scaled_fp8_quant
-    else:
-        scaled_fp8_quant = None
-
-
-@QUANTMETHODS.register(["fp8-per-token", "fp8w8a8"])
-class FP8PerTokenQuantization(QuantizationMethod):
-    def __init__(self):
-        super().__init__()
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-
-        self.cache_manager = g_cache_manager
-        self.is_moe = False
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
-        self._backend = QUANT_BACKEND.get_backend("fp8-per-token")
-        logger.info(f"FP8PerTokenQuantization using backend: {self._backend.name}")
-
-    @property
-    def method_name(self):
-        return "fp8-per-token"
-
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        """Quantize weights using per-token FP8 quantization."""
-        if self.is_moe:
-            return self._quantize_moe(weight, output, offset)
-
-        if scaled_fp8_quant is None:
-            raise RuntimeError("No FP8 quantization kernel available. Install vLLM or lightllm-kernel.")
-
-        qweight, weight_scale = scaled_fp8_quant(
-            weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
-        )
-        output.weight[offset : offset + qweight.shape[0], :].copy_(qweight)
-        output.weight_scale[offset : offset + weight_scale.shape[0]].copy_(weight_scale.view(-1))
-        return
-
-    def _quantize_moe(self, weight: torch.Tensor, output: WeightPack, offset: int) -> None:
-        if scaled_fp8_quant is None:
-            raise RuntimeError("No FP8 quantization kernel available. Install vLLM or lightllm-kernel.")
-
-        num_experts = weight.shape[0]
-        qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
-        weight_scales = []
-        for i in range(num_experts):
-            qweight, weight_scale = scaled_fp8_quant(
-                weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
-            )
-            qweights[i] = qweight
-            weight_scales.append(weight_scale)
-        weight_scale = torch.stack(weight_scales, dim=0).contiguous()
-        output.weight.copy_(qweights)
-        output.weight_scale.copy_(weight_scale)
-        return
-
-    def apply(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor] = None,
-        workspace: Optional[torch.Tensor] = None,
-        use_custom_tensor_mananger: bool = True,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if self._backend == BackendType.TRITON:
-            return self._apply_triton(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
-        else:
-            return self._apply_vllm(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
-
-    def _apply_vllm(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        use_custom_tensor_mananger: bool,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale
-
-        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
-
-        m = input_tensor.shape[0]
-        n = qweight.shape[1]
-
-        if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
-        return out
-
-    def _apply_triton(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        use_custom_tensor_mananger: bool,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale
-
-        if scaled_fp8_quant is None:
-            raise RuntimeError("No FP8 quantization kernel available. Install vLLM or lightllm-kernel.")
-
-        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
-
-        m = input_tensor.shape[0]
-        n = qweight.shape[1]
-
-        if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        out = fp8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
-
-        if bias is not None:
-            out.add_(bias)
-        return out
-
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
diff --git a/lightllm/common/quantization/no_quant.py b/lightllm/common/quantization/no_quant.py
index e92d821c1..c05c90b21 100644
--- a/lightllm/common/quantization/no_quant.py
+++ b/lightllm/common/quantization/no_quant.py
@@ -5,7 +5,8 @@
 from lightllm.common.quantization.registry import QUANTMETHODS
 
 
-@QUANTMETHODS.register("none")
+@QUANTMETHODS.register("none", platform="musa")
+@QUANTMETHODS.register("none", platform="cuda")
 class NoQuantization(QuantizationMethod):
     """No quantization - uses full precision weights."""
 
diff --git a/lightllm/common/quantization/registry.py b/lightllm/common/quantization/registry.py
index e9b407398..c9baa64e2 100644
--- a/lightllm/common/quantization/registry.py
+++ b/lightllm/common/quantization/registry.py
@@ -5,21 +5,27 @@ class QuantMethodFactory:
     def __init__(self):
         self._quant_methods = {}
 
-    def register(self, names):
+    def register(self, names, platform="cuda"):
         def decorator(cls):
             local_names = names
             if isinstance(local_names, str):
                 local_names = [local_names]
             for n in local_names:
-                self._quant_methods[n] = cls
+                if n not in self._quant_methods:
+                    self._quant_methods[n] = {}
+                self._quant_methods[n][platform] = cls
             return cls
 
         return decorator
 
-    def get(self, key, *args, **kwargs) -> "QuantizationMethod":
-        quant_method_class = self._quant_methods.get(key)
-        if not quant_method_class:
+    def get(self, key, platform="cuda", *args, **kwargs) -> "QuantizationMethod":
+        quant_method_class_dict = self._quant_methods.get(key)
+        if not quant_method_class_dict:
             raise ValueError(f"QuantMethod '{key}' not supported.")
+
+        quant_method_class = quant_method_class_dict.get(platform)
+        if quant_method_class is None:
+            raise ValueError(f"QuantMethod '{key}' for platform '{platform}' not supported.")
         return quant_method_class()
 
 
diff --git a/lightllm/common/quantization/w8a8.py b/lightllm/common/quantization/w8a8.py
index 721807356..0a74d9887 100644
--- a/lightllm/common/quantization/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -1,74 +1,72 @@
+import os
 import torch
+import torch.nn.functional as F
 from typing import Optional
+from .quantize_method import QuantizationMethod
+from .registry import QUANTMETHODS
+from lightllm.common.basemodel.triton_kernel.quantization.scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
+from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import per_token_group_quant_fp8
+from lightllm.common.basemodel.triton_kernel.quantization.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
+from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
-from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
-from lightllm.common.quantization.registry import QUANTMETHODS
-from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
-from lightllm.common.basemodel.triton_kernel.quantization.scaled_mm_per_token_kernel import (
-    fp8_scaled_mm_per_token,
-    int8_scaled_mm_per_token,
-)
-from lightllm.utils.log_utils import init_logger
 
-logger = init_logger(__name__)
+from .quantize_method import WeightPack
 
-# Conditional imports for optional backends
-try:
-    from lightllm.utils.vllm_utils import HAS_VLLM
+if HAS_LIGHTLLM_KERNEL:
 
-    if HAS_VLLM:
-        from lightllm.utils.vllm_utils import vllm_ops, cutlass_scaled_mm
-    else:
-        vllm_ops = None
-        cutlass_scaled_mm = None
-except ImportError:
-    HAS_VLLM = False
-    vllm_ops = None
-    cutlass_scaled_mm = None
-
-
-try:
-    from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
-
-    if HAS_LIGHTLLM_KERNEL:
-
-        def scaled_fp8_quant(tensor, *args, **kwargs):
-            return light_ops.per_token_quant_bf16_fp8(tensor)
+    def scaled_fp8_quant(tensor, *args, **kwargs):
+        return light_ops.per_token_quant_bf16_fp8(tensor)
 
-    else:
-        if HAS_VLLM:
-            scaled_fp8_quant = vllm_ops.scaled_fp8_quant
-        else:
-            scaled_fp8_quant = None
-except ImportError:
-    HAS_LIGHTLLM_KERNEL = False
+else:
     if HAS_VLLM:
         scaled_fp8_quant = vllm_ops.scaled_fp8_quant
-    else:
-        scaled_fp8_quant = None
+
+LIGHTLLM_USE_TRITON_FP8_SCALED_MM = os.getenv("LIGHTLLM_USE_TRITON_FP8_SCALED_MM", "False").upper() in [
+    "ON",
+    "TRUE",
+    "1",
+]
 
 
-@QUANTMETHODS.register(["w8a8", "vllm-w8a8"])
-class W8A8Quantization(QuantizationMethod, PlatformAwareOp):
+class BaseQuantizationMethod(QuantizationMethod):
     def __init__(self):
         super().__init__()
+        assert HAS_VLLM, "vllm are not installed, you can't use quant api of them."
         from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
         self.cache_manager = g_cache_manager
-        self.has_weight_scale = True
-        self.has_weight_zero_point = False
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        raise NotImplementedError("Not implemented")
+
+    def apply(
+        self,
+        input_tensor: torch.Tensor,
+        weight_pack: WeightPack,
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Not implemented")
 
     @property
     def method_name(self):
-        return "w8a8"
+        return "w8a8-base"
 
     def create_weight(
         self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
     ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
-        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
+        raise NotImplementedError("Not implemented")
+
+
+@QUANTMETHODS.register(["vllm-w8a8", "w8a8"], platform="cuda")
+class w8a8QuantizationMethod(BaseQuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False
 
     def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
         weight = weight.float().cuda(self.device_id_)
@@ -88,90 +86,45 @@ def apply(
         use_custom_tensor_mananger: bool = True,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        return self._forward(
-            input_tensor=input_tensor,
-            weight_pack=weight_pack,
-            out=out,
-            workspace=workspace,
-            use_custom_tensor_mananger=use_custom_tensor_mananger,
-            bias=bias,
-        )
-
-    def _triton_forward(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        workspace: Optional[torch.Tensor],
-        use_custom_tensor_mananger: bool,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-
+        input_scale = None
         qweight = weight_pack.weight.t()
         weight_scale = weight_pack.weight_scale
-
-        # TODO: support fp8 quantization triton
-
-        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
-
+        input_scale = None  # dynamic quantization for input tensor
+        x_q, x_scale, x_zp = vllm_ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
         m = input_tensor.shape[0]
         n = qweight.shape[1]
-
         if out is None:
             if use_custom_tensor_mananger:
                 out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
             else:
                 out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        out = int8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
-
-        if bias is not None:
-            out.add_(bias)
+        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
         return out
 
-    def _cuda_forward(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        workspace: Optional[torch.Tensor],
-        use_custom_tensor_mananger: bool,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale
-
-        x_q, x_scale, x_zp = vllm_ops.scaled_int8_quant(input_tensor, scale=None, azp=None, symmetric=True)
-
-        m = input_tensor.shape[0]
-        n = qweight.shape[1]
-
-        if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+    @property
+    def method_name(self):
+        return "vllm-w8a8"
 
-        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
-        return out
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
 
 
-class Fp8W8A8Quantization(QuantizationMethod, PlatformAwareOp):
+@QUANTMETHODS.register(["vllm-fp8w8a8", "fp8w8a8"], platform="cuda")
+class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
     def __init__(self):
         super().__init__()
-        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
-
-        self.cache_manager = g_cache_manager
         self.is_moe = False
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    @property
-    def method_name(self):
-        return "f8w8a8"
-
     def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
-        """Quantize weights using per-token FP8 quantization."""
+        if self.is_moe:
+            return self.quantize_moe(weight, output, offset)
         qweight, weight_scale = scaled_fp8_quant(
             weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
@@ -179,18 +132,18 @@ def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) ->
         output.weight_scale[offset : offset + weight_scale.shape[0]].copy_(weight_scale.view(-1))
         return
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        expert_prefix = (num_experts,) if num_experts > 1 else ()
-        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        if self.is_moe:
-            assert num_experts > 1, "Number of experts must be greater than 1 for MOE"
-            # per-tensor weight quantization for moe
-            weight_scale = torch.empty((num_experts,), dtype=torch.float32).cuda(device_id)
-        else:
-            weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
+    def quantize_moe(self, weight: torch.Tensor, output: Optional[WeightPack] = None, offset: int = 0) -> WeightPack:
+        """Quantize each expert of ``weight`` to FP8; also fills ``output`` when given. ``offset`` is unused."""
+        qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
+        weight_scales = [None] * weight.shape[0]
+        for i in range(weight.shape[0]):
+            expert_weight = weight[i].contiguous().cuda(self.device_id_)
+            qweights[i], weight_scales[i] = scaled_fp8_quant(expert_weight, scale=None, use_per_token_if_dynamic=True)
+        pack = WeightPack(weight=qweights, weight_scale=torch.stack(weight_scales, dim=0).contiguous())
+        if output is not None:  # quantize() passes the preallocated pack and expects it filled in place
+            output.weight.copy_(pack.weight)
+            output.weight_scale.copy_(pack.weight_scale.view_as(output.weight_scale))
+        return pack
 
     def apply(
         self,
@@ -200,60 +153,103 @@ def apply(
         workspace: Optional[torch.Tensor] = None,
         use_custom_tensor_mananger: bool = True,
         bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        return self._forward(input_tensor, weight_pack, out, use_custom_tensor_mananger, bias)
-
-    def _cuda_forward(
-        self,
-        input_tensor: torch.Tensor,
-        weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        use_custom_tensor_mananger: bool,
-        bias: Optional[torch.Tensor],
     ) -> torch.Tensor:
         qweight = weight_pack.weight.t()
         weight_scale = weight_pack.weight_scale
-
         x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
-
         m = input_tensor.shape[0]
         n = qweight.shape[1]
-
         if out is None:
             if use_custom_tensor_mananger:
                 out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
             else:
                 out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
+        if LIGHTLLM_USE_TRITON_FP8_SCALED_MM:
+            out = fp8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
+            # The triton kernel is called without a bias, so apply it here; cutlass_scaled_mm receives it directly.
+            if bias is not None:
+                out.add_(bias)
+        else:
+            cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
         return out
 
-    def _apply_triton(
+    @property
+    def method_name(self):
+        return "vllm-fp8w8a8"
+
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
+
+
+@QUANTMETHODS.register(["vllm-fp8w8a8-b128", "fp8w8a8-b128"], platform="cuda")
+class FP8w8a8B128QuantizationMethod(BaseQuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        self.block_size = 128
+        self.weight_scale_suffix = "weight_scale_inv"
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False
+
+    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+        """Quantize ``weight`` with ``weight_quant`` and copy it and its scales into ``output`` at row ``offset``."""
+        from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
+
+        weight, scale = weight_quant(weight.cuda(output.weight.device), self.block_size)
+        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
+        scale_start = offset // self.block_size  # offset is in weight rows; the slice must span scale.shape[0] rows
+        output.weight_scale[scale_start : scale_start + scale.shape[0]].copy_(scale)
+
+    def apply(
         self,
         input_tensor: torch.Tensor,
         weight_pack: WeightPack,
-        out: Optional[torch.Tensor],
-        use_custom_tensor_mananger: bool,
-        bias: Optional[torch.Tensor],
+        out: Optional[torch.Tensor] = None,
+        workspace: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         qweight = weight_pack.weight.t()
-        weight_scale = weight_pack.weight_scale
-
-        # TODO: support fp8 quantization triton
-
-        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
-
-        m = input_tensor.shape[0]
+        weight_scale = weight_pack.weight_scale.t()
+        input_scale = None  # dynamic quantization for input tensor
+        m, k = input_tensor.shape
         n = qweight.shape[1]
-
+        alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty
+        if input_scale is None:
+            qinput_tensor, input_scale = per_token_group_quant_fp8(
+                input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
+            )
         if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor((m, n), input_tensor.dtype, device=input_tensor.device)
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+            out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+        if n % 128 != 0:
+            # Triton block GEMM fallback; this call passes no bias to the kernel, so add it afterwards.
+            block_shape = (self.block_size, self.block_size)
+            w8a8_block_fp8_matmul(
+                qinput_tensor, qweight, input_scale, weight_scale, out, block_shape, dtype=input_tensor.dtype
+            )
+            if bias is not None:
+                out.add_(bias)
+        else:
+            # cutlass_scaled_mm receives the bias directly. The scale is re-laid out so its transpose is
+            # contiguous (presumably the layout the kernel expects; confirm against vllm's cutlass_scaled_mm).
+            input_scale = input_scale.t().contiguous().t()
+            cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
+        return out
 
-        out = fp8_scaled_mm_per_token(x_q, qweight, x_scale, weight_scale, input_tensor.dtype, out)
+    @property
+    def method_name(self):
+        return "vllm-fp8w8a8-b128"
 
-        if bias is not None:
-            out.add_(bias)
-        return out
+    def create_weight(
+        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> WeightPack:
+        expert_prefix = (num_experts,) if num_experts > 1 else ()
+        weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
+        weight_scale = torch.empty(
+            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
+        ).cuda(device_id)
+        return WeightPack(weight=weight, weight_scale=weight_scale)
diff --git a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
index 2e14eca26..cb540ee4d 100644
--- a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
@@ -15,23 +15,13 @@ def __init__(self, data_type, network_config):
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            # Share weight with EmbeddingWeight to save memory
-            self.lm_head_weight_ = LMHeadWeight(
-                dim=hidden_size,
-                vocab_size=vocab_size,
-                weight_name="model.embed_tokens.weight",
-                data_type=self.data_type_,
-                shared_weight=self.wte_weight_,
-            )
-        else:
-            self.lm_head_weight_ = LMHeadWeight(
-                dim=hidden_size,
-                vocab_size=vocab_size,
-                weight_name="lm_head.weight",
-                data_type=self.data_type_,
-            )
-
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="model.embed_tokens.weight",
+            data_type=self.data_type_,
+            embedding_weight=self.wte_weight_ if tie_word_embeddings else None,
+        )
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.norm.weight",
diff --git a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
index d273d51ad..52f9289eb 100644
--- a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
@@ -91,15 +91,13 @@ def _tpsp_get_qkv(
 
         input = input.view(-1, self.embed_dim_)
         q = layer_weight.q_proj.mm(input)
-        cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
-
-        layer_weight.q_norm_weight_(q.view(-1, self.head_dim_), eps=self.eps_, out=q.view(-1, self.head_dim_))
-
-        cache_kv[:, : self.tp_k_head_num_, :] = layer_weight.k_norm_weight_(
-            cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
+        cache_kv = layer_weight.kv_proj.mm(input)
+        layer_weight.q_norm_weight_(q, eps=self.eps_)
+        layer_weight.k_norm_weight_(
+            cache_kv[:, : self.tp_k_head_num_ * self.head_dim_],
             eps=self.eps_,
-            alloc_func=self.alloc_tensor,
-        ).view(-1, self.tp_k_head_num_, cache_kv.shape[-1])
+        )
+        cache_kv = cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
 
         rotary_emb_fwd(
             q.view(-1, self.tp_q_head_num_, self.head_dim_),
diff --git a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
index 475bcee95..b3fe64ce5 100644
--- a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
@@ -14,22 +14,13 @@ def __init__(self, data_type, network_config):
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            # Share weight with EmbeddingWeight to save memory
-            self.lm_head_weight_ = LMHeadWeight(
-                dim=hidden_size,
-                vocab_size=vocab_size,
-                weight_name="model.language_model.embed_tokens.weight",
-                data_type=self.data_type_,
-                shared_weight=self.wte_weight_,
-            )
-        else:
-            self.lm_head_weight_ = LMHeadWeight(
-                dim=hidden_size,
-                vocab_size=vocab_size,
-                weight_name="lm_head.weight",
-                data_type=self.data_type_,
-            )
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="model.language_model.embed_tokens.weight",
+            data_type=self.data_type_,
+            embedding_weight=self.wte_weight_ if tie_word_embeddings else None,
+        )
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.language_model.norm.weight",
diff --git a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
index e6d5cb441..2071b52cd 100644
--- a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
@@ -14,22 +14,13 @@ def __init__(self, data_type, network_config):
             data_type=self.data_type_,
         )
         tie_word_embeddings = self.network_config_.get("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            # Share weight with EmbeddingWeight to save memory
-            self.lm_head_weight_ = LMHeadWeight(
-                dim=hidden_size,
-                vocab_size=vocab_size,
-                weight_name="model.embed_tokens.weight",
-                data_type=self.data_type_,
-                shared_weight=self.wte_weight_,
-            )
-        else:
-            self.lm_head_weight_ = LMHeadWeight(
-                dim=hidden_size,
-                vocab_size=vocab_size,
-                weight_name="lm_head.weight",
-                data_type=self.data_type_,
-            )
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="model.embed_tokens.weight",
+            data_type=self.data_type_,
+            embedding_weight=self.wte_weight_ if tie_word_embeddings else None,
+        )
 
         self.final_norm_weight_ = LayerNormWeight(
             dim=hidden_size,

From 72c8f1701d5f700acd1a2be523864b783211929f Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 19 Jan 2026 14:00:04 +0000
Subject: [PATCH 21/65] update docs

---
 docs/CN/source/models/add_new_model.md | 37 --------------------------
 docs/EN/source/models/add_new_model.md | 36 -------------------------
 2 files changed, 73 deletions(-)

diff --git a/docs/CN/source/models/add_new_model.md b/docs/CN/source/models/add_new_model.md
index 49b47ffa2..5d34cf747 100755
--- a/docs/CN/source/models/add_new_model.md
+++ b/docs/CN/source/models/add_new_model.md
@@ -162,19 +162,6 @@ class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
                                                                  self.tp_rank_: split_vob_size * (self.tp_rank_ + 1), :])
             self.lm_head_weight_ = self.wte_weight_
         return
-    
-    def verify_load(self):
-        errors = "weights load not ok"
-        weights = [self.pre_norm_weight_, 
-                   self.pre_norm_bias_, 
-                   self.final_norm_weight_, 
-                   self.final_norm_bias_,
-                   self.wte_weight_,
-                   self.lm_head_weight_]
-        for i in range(len(weights)):
-            assert weights[i] is not None, "index:" + str(i) + " " + errors
-        return 
-
 ~~~
 
 ***transformer_layer_weight.py***
@@ -204,30 +191,6 @@ class BloomTransformerLayerWeight(TransformerLayerWeight):
         self._load_qkvo_weights(weights)
         self._load_ffn_weights(weights)
         return
-    
-    def verify_load(self):
-        errors = "weights load not ok"
-        weights = [self.att_norm_weight_,
-                   self.att_norm_bias_,
-                   self.q_weight_,
-                   self.k_weight_,
-                   self.v_weight_,
-                   self.q_bias_,
-                   self.k_bias_,
-                   self.v_bias_,
-                   self.o_weight_,
-                   self.o_bias_,
-
-                   self.ffn_norm_weight_,
-                   self.ffn_norm_bias_,
-                   self.ffn_1_weight_,
-                   self.ffn_1_bias_,
-                   self.ffn_2_weight_,
-                   self.ffn_2_bias_,
-                   ]
-        for i in range(len(weights)):
-            assert weights[i] is not None, "index:" + str(i) + " " + errors
-        return 
 
     def _load_qkvo_weights(self, weights):
         if f"h.{self.layer_num_}.input_layernorm.weight" in weights:
diff --git a/docs/EN/source/models/add_new_model.md b/docs/EN/source/models/add_new_model.md
index 6127dffaf..7417c39cf 100755
--- a/docs/EN/source/models/add_new_model.md
+++ b/docs/EN/source/models/add_new_model.md
@@ -162,18 +162,6 @@ class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
                                                                  self.tp_rank_: split_vob_size * (self.tp_rank_ + 1), :])
             self.lm_head_weight_ = self.wte_weight_
         return
-    
-    def verify_load(self):
-        errors = "weights load not ok"
-        weights = [self.pre_norm_weight_, 
-                   self.pre_norm_bias_, 
-                   self.final_norm_weight_, 
-                   self.final_norm_bias_,
-                   self.wte_weight_,
-                   self.lm_head_weight_]
-        for i in range(len(weights)):
-            assert weights[i] is not None, "index:" + str(i) + " " + errors
-        return 
 
 ~~~
 
@@ -204,30 +192,6 @@ class BloomTransformerLayerWeight(TransformerLayerWeight):
         self._load_qkvo_weights(weights)
         self._load_ffn_weights(weights)
         return
-    
-    def verify_load(self):
-        errors = "weights load not ok"
-        weights = [self.att_norm_weight_,
-                   self.att_norm_bias_,
-                   self.q_weight_,
-                   self.k_weight_,
-                   self.v_weight_,
-                   self.q_bias_,
-                   self.k_bias_,
-                   self.v_bias_,
-                   self.o_weight_,
-                   self.o_bias_,
-
-                   self.ffn_norm_weight_,
-                   self.ffn_norm_bias_,
-                   self.ffn_1_weight_,
-                   self.ffn_1_bias_,
-                   self.ffn_2_weight_,
-                   self.ffn_2_bias_,
-                   ]
-        for i in range(len(weights)):
-            assert weights[i] is not None, "index:" + str(i) + " " + errors
-        return 
 
     def _load_qkvo_weights(self, weights):
         if f"h.{self.layer_num_}.input_layernorm.weight" in weights:

From 60f8bc86834c991e0bae7a2a951678e2cbf48d45 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 19 Jan 2026 14:39:50 +0000
Subject: [PATCH 22/65] fix pre-weight

---
 .../models/llama/layer_weights/pre_and_post_layer_weight.py     | 2 +-
 .../qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py     | 2 +-
 .../starcoder2/layer_weights/pre_and_post_layer_weight.py       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
index cb540ee4d..8efa36cf8 100644
--- a/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/pre_and_post_layer_weight.py
@@ -18,7 +18,7 @@ def __init__(self, data_type, network_config):
         self.lm_head_weight_ = LMHeadWeight(
             dim=hidden_size,
             vocab_size=vocab_size,
-            weight_name="model.embed_tokens.weight",
+            weight_name="lm_head.weight",
             data_type=self.data_type_,
             embedding_weight=self.wte_weight_ if tie_word_embeddings else None,
         )
diff --git a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
index b3fe64ce5..0ba06c6ae 100644
--- a/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_vl_moe/layer_weights/pre_and_post_layer_weight.py
@@ -17,7 +17,7 @@ def __init__(self, data_type, network_config):
         self.lm_head_weight_ = LMHeadWeight(
             dim=hidden_size,
             vocab_size=vocab_size,
-            weight_name="model.language_model.embed_tokens.weight",
+            weight_name="lm_head.weight",
             data_type=self.data_type_,
             embedding_weight=self.wte_weight_ if tie_word_embeddings else None,
         )
diff --git a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
index 2071b52cd..cc256c442 100644
--- a/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/starcoder2/layer_weights/pre_and_post_layer_weight.py
@@ -17,7 +17,7 @@ def __init__(self, data_type, network_config):
         self.lm_head_weight_ = LMHeadWeight(
             dim=hidden_size,
             vocab_size=vocab_size,
-            weight_name="model.embed_tokens.weight",
+            weight_name="lm_head.weight",
             data_type=self.data_type_,
             embedding_weight=self.wte_weight_ if tie_word_embeddings else None,
         )

From 7e851856fc2b5aee590b1deb1ce086cf980b237e Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 20 Jan 2026 07:28:19 +0000
Subject: [PATCH 23/65] fix deepseek

---
 lightllm/common/basemodel/basemodel.py        |  2 +
 .../layer_weights/base_layer_weight.py        |  9 ++
 .../layer_weights/meta_weights/__init__.py    |  1 +
 .../meta_weights/att_sink_weight.py           |  1 +
 .../layer_weights/meta_weights/base_weight.py |  7 ++
 .../meta_weights/embedding_weight.py          | 15 +++
 .../fused_moe/fused_moe_weight_tp.py          | 27 +++--
 .../meta_weights/mm_weight/__init__.py        |  2 +-
 .../meta_weights/mm_weight/mm_weight.py       | 71 +++++++++++++
 .../meta_weights/mm_weight/rowmm_weight.py    | 39 +++++++-
 .../layer_weights/meta_weights/norm_weight.py | 19 +++-
 lightllm/common/quantization/__init__.py      |  2 +-
 lightllm/common/quantization/deepgemm.py      | 18 +---
 .../layer_weights/transformer_layer_weight.py | 99 +++++++------------
 .../layer_weights/transformer_layer_weight.py |  9 --
 15 files changed, 223 insertions(+), 98 deletions(-)

diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
index 435f39a88..2dcf0c434 100755
--- a/lightllm/common/basemodel/basemodel.py
+++ b/lightllm/common/basemodel/basemodel.py
@@ -184,6 +184,8 @@ def _load_hf_weights(self):
             transformer_layer_list=self.trans_layers_weight,
             weight_dict=self.weight_dict,
         )
+        self.pre_post_weight.verify_load()
+        [weight.verify_load() for weight in self.trans_layers_weight]
         return
 
     def _init_mem_manager(self):
diff --git a/lightllm/common/basemodel/layer_weights/base_layer_weight.py b/lightllm/common/basemodel/layer_weights/base_layer_weight.py
index 6bdeb64d2..1875e2c3b 100644
--- a/lightllm/common/basemodel/layer_weights/base_layer_weight.py
+++ b/lightllm/common/basemodel/layer_weights/base_layer_weight.py
@@ -26,5 +26,14 @@ def init_static_params(self):
         """
         pass
 
+    def verify_load(self):
+        """
+        verify all load is ok
+        """
+        for attr_name in dir(self):
+            attr = getattr(self, attr_name)
+            if isinstance(attr, BaseWeight):
+                assert attr.verify_load(), f"Loading {attr_name} of layers {self.layer_num_} fails."
+
     def _cuda(self, cpu_tensor):
         return cpu_tensor.contiguous().to(self.data_type_).cuda(get_current_device_id())
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index fef70acf5..b67f271ca 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -3,6 +3,7 @@
     MMWeightTpl,
     ROWMMWeight,
     KVROWNMMWeight,
+    ROWBMMWeight,
     COLMMWeight,
 )
 from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight, QKRMSNORMWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
index 1c22bcb7d..32d59e66e 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
@@ -10,6 +10,7 @@ def __init__(self, weight_name: str, data_type):
         self.weight_name = weight_name
         self.data_type_ = data_type
         self.weight: torch.Tensor = None
+        # TODO: add create weight function
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name not in weights or self.weight is not None:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
index 58860ab30..da0388786 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
@@ -17,6 +17,10 @@ def load_hf_weights(self, weights):
     def _create_weight(self):
         pass
 
+    @abstractmethod
+    def verify_load(self):
+        pass
+
 
 class BaseWeightTpl(BaseWeight):
     def __init__(self, tp_rank: int = None, tp_world_size: int = None, data_type: torch.dtype = None):
@@ -29,5 +33,8 @@ def __init__(self, tp_rank: int = None, tp_world_size: int = None, data_type: to
     def load_hf_weights(self, weights):
         raise NotImplementedError("load_hf_weights must implement this method")
 
+    def verify_load(self):
+        raise NotImplementedError("verify_load must implement this method")
+
     def _create_weight(self) -> bool:
         raise NotImplementedError("create_weight must implement this method")
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index df9050d4f..9737f41b2 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -28,6 +28,7 @@ def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch
     def _create_weight(self):
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
         self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
+        self.load_cnt = 0
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name not in weights:
@@ -40,6 +41,10 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
         logger.info(f"loaded weight vocab_size: {self.vocab_size}")
         self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
+        self.load_cnt += 1
+
+    def verify_load(self):
+        return self.load_cnt == 1
 
     def _native_forward(
         self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
@@ -109,6 +114,7 @@ def __init__(
         self._create_weight()
 
     def _create_weight(self):
+        self.load_cnt = 0
         if self._embedding_weight is not None:
             self.weight = self._embedding_weight.weight
             return
@@ -128,6 +134,10 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
         logger.info(f"loaded weight vocab_size: {self.vocab_size}")
         self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
+        self.load_cnt += 1
+
+    def verify_load(self):
+        return self.load_cnt == 1 or self._embedding_weight is not None
 
     def _native_forward(
         self, input: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
@@ -171,6 +181,7 @@ def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(
             self.max_position_embeddings, self.dim, dtype=self.data_type_, device=self.device_id_
         )
+        self.load_cnt = 0
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name not in weights:
@@ -182,6 +193,10 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         ), f"max_position_embeddings: {loaded_max_position_embeddings} != expected: {self.max_position_embeddings}"
         logger.info(f"loaded weight max_position_embeddings: {self.max_position_embeddings}")
         self.weight.copy_(t_weight.to(self.data_type_))
+        self.load_cnt += 1
+
+    def verify_load(self):
+        return self.load_cnt == 1
 
     def _native_forward(
         self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
index 876dc44bd..c7892ab3b 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
@@ -8,6 +8,7 @@
     get_row_slice_mixin,
     get_col_slice_mixin,
 )
+import threading
 
 
 def create_tp_moe_wegiht_obj(
@@ -100,6 +101,7 @@ def __init__(
         self.col_slicer = get_col_slice_mixin(
             self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
         )
+        self.lock = threading.Lock()
         self._create_weight()
 
     def _create_weight(self):
@@ -107,7 +109,7 @@ def _create_weight(self):
         intermediate_size = self.split_inter_size
 
         # Create e_score_correction_bias
-        if self.e_score_correction_bias is not None:
+        if self.e_score_correction_bias_name is not None:
             self.e_score_correction_bias = torch.empty(
                 (total_expert_num,),
                 dtype=self.data_type_,
@@ -128,6 +130,7 @@ def _create_weight(self):
             device_id=self.device_id_,
             num_experts=total_expert_num,
         )
+        self.load_cnt = 0
 
     def _select_experts(
         self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
@@ -254,13 +257,18 @@ def load_hf_weights(self, weights):
 
         # Load each expert with TP slicing
         for i_experts in range(self.n_routed_experts):
-            self._load_expert(i_experts, weights, type="weight", suffix=self.quant_method.weight_suffix)
+            with self.lock:
+                self._load_expert(i_experts, weights, type="weight", suffix=self.quant_method.weight_suffix)
             if self.w13.weight_scale is not None:
-                self._load_expert(i_experts, weights, type="weight_scale", suffix=self.quant_method.weight_scale_suffix)
+                with self.lock:
+                    self._load_expert(
+                        i_experts, weights, type="weight_scale", suffix=self.quant_method.weight_scale_suffix
+                    )
             if self.w13.weight_zero_point is not None:
-                self._load_expert(
-                    i_experts, weights, type="weight_zero_point", suffix=self.quant_method.weight_zero_point_suffix
-                )
+                with self.lock:
+                    self._load_expert(
+                        i_experts, weights, type="weight_zero_point", suffix=self.quant_method.weight_zero_point_suffix
+                    )
 
     def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0):
         if self.quant_method.weight_need_quanted(weight):
@@ -276,12 +284,17 @@ def _load_expert(self, expert_idx, weights, type: str, suffix: str = "weight"):
         load_func, slice_func = self._get_load_and_slice_func(type, is_row=True)
         if w1_weight in weights:
             load_func(slice_func(weights[w1_weight]), self.w13.get_expert(expert_idx), start_idx=0)
+            self.load_cnt += 1
         if w3_weight in weights:
             load_func(slice_func(weights[w3_weight]), self.w13.get_expert(expert_idx), start_idx=intermediate_size)
-
+            self.load_cnt += 1
         load_func, slice_func = self._get_load_and_slice_func(type, is_row=False)
         if w2_weight in weights:
             load_func(slice_func(weights[w2_weight]), self.w2.get_expert(expert_idx), start_idx=0)
+            self.load_cnt += 1
+
+    def verify_load(self):
+        return self.load_cnt == self.n_routed_experts * 3 * 2
 
     def _get_load_and_slice_func(self, type: str, is_row: bool = True):
         if is_row:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
index ae0c65197..e9ae4f30a 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py
@@ -1,5 +1,5 @@
 from .mm_weight import (
     MMWeightTpl,
 )
-from .rowmm_weight import ROWMMWeight, KVROWNMMWeight
+from .rowmm_weight import ROWMMWeight, KVROWNMMWeight, ROWBMMWeight
 from .colmm_weight import COLMMWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 728ed82fa..3ba4d3e59 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -127,6 +127,7 @@ def _create_weight(self):
         self.mm_param: WeightPack = self.quant_method.create_weight(
             in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()
         )
+        self.load_cnt = 0
         return
 
     # 执行顺序
@@ -140,6 +141,7 @@ def _load_weight(
                 self.quant_method.quantize(weight, self.mm_param, offset=start_idx)
             else:
                 self.quant_method.load_weight(weight, self.mm_param, start_idx)
+            self.load_cnt += 1
         return
 
     def _load_bias(
@@ -159,6 +161,7 @@ def _load_weight_scale(
             weight_scale = self.param_slicer._slice_weight_scale(weights[param_name])
             start_idx = self.cusum_out_dims[sub_child_index]
             self.quant_method.load_weight_scale(weight_scale, self.mm_param, start_idx)
+            self.load_cnt += 1
         return
 
     def _load_weight_zero_point(
@@ -168,10 +171,78 @@ def _load_weight_zero_point(
             weight_zero_point = self.param_slicer._slice_weight_zero_point(weights[param_name])
             start_idx = self.cusum_out_dims[sub_child_index]
             self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param, start_idx)
+            self.load_cnt += 1
         return
 
+    def verify_load(self):
+        if self.quant_method.method_name != "none":
+            return self.load_cnt == len(self.weight_names) * 2
+        else:
+            return self.load_cnt == len(self.weight_names)
+
     def _get_tp_dim(self, dim: int) -> int:
         assert (
             dim % self.tp_world_size_ == 0
         ), f"dim must be divisible by tp_world_size_, but found: {dim} % {self.tp_world_size_}"
         return dim // self.tp_world_size_
+
+
+class BMMWeightTpl(BaseWeightTpl):
+    def __init__(
+        self,
+        dim0: int,
+        dim1: int,
+        dim2: int,
+        weight_names: Union[str, List[str]],
+        data_type: torch.dtype,
+        bias_names: Optional[Union[str, List[str]]] = None,
+        quant_method: QuantizationMethod = None,
+        tp_rank: int = None,
+        tp_world_size: int = None,
+    ) -> None:
+        super().__init__(tp_rank, tp_world_size, data_type)
+        if isinstance(weight_names, str):
+            weight_names = [weight_names]
+        self.weight_names = weight_names
+        self.bias_names = bias_names
+        assert bias_names is None, "bmm not support bias"
+        if isinstance(bias_names, list):
+            assert all(bias_name is None for bias_name in bias_names), "bmm not support bias"
+        assert quant_method is None, "bmm not support quantized weight"
+        self.quant_method = quant_method
+        self.dim0 = dim0
+        self.dim1 = dim1
+        self.dim2 = dim2
+        self._create_weight()
+        return
+
+    def _create_weight(self):
+        self.weight = torch.empty(self.dim0, self.dim1, self.dim2, dtype=self.data_type_).cuda(get_current_device_id())
+        self.load_cnt = 0
+        return
+
+    def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
+        for weight_name in self.weight_names:
+            if weight_name in weights:
+                weight = self.param_slicer._slice_weight(weights[weight_name])
+                self.weight.copy_(weight)
+                self.load_cnt += 1
+        return
+
+    def verify_load(self):
+        return self.load_cnt == len(self.weight_names)
+
+    def bmm(
+        self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
+    ) -> torch.Tensor:
+        # 目前 bmm 不支持量化运算操作
+        fpweight = self.weight
+        if out is None:
+            shape = (input_tensor.shape[0], input_tensor.shape[1], fpweight.shape[2])
+            dtype = input_tensor.dtype
+            device = input_tensor.device
+            if use_custom_tensor_mananger:
+                out = g_cache_manager.alloc_tensor(shape, dtype, device=device)
+            else:
+                out = torch.empty(shape, dtype=dtype, device=device)
+        return torch.bmm(input_tensor, fpweight, out=out)
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
index d7554b375..30a699bb6 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py
@@ -1,7 +1,5 @@
 import torch
-from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import (
-    MMWeightTpl,
-)
+from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightTpl, BMMWeightTpl
 from lightllm.common.quantization import Quantcfg
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.common.quantization.quantize_method import QuantizationMethod
@@ -92,3 +90,38 @@ def _get_tp_padded_head_num(self, head_num: int):
                 f"tp_world_size_ must be divisible by head_num, "
                 f"but found: {head_num} % {self.tp_world_size_}"
             )
+
+
+class ROWBMMWeight(BMMWeightTpl):
+    def __init__(
+        self,
+        dim0: int,
+        dim1: int,
+        dim2: int,
+        weight_names: Union[str, List[str]],
+        data_type: torch.dtype,
+        bias_names: Optional[Union[str, List[str]]] = None,
+        quant_method: QuantizationMethod = None,
+        tp_rank: int = None,
+        tp_world_size: int = None,
+    ) -> None:
+        self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
+        self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
+        assert (
+            dim0 % self.tp_world_size_ == 0
+        ), f"dim0 of bmm must be divisible by tp_world_size_, but found: {dim0} % {self.tp_world_size_}"
+        dim0 = dim0 // self.tp_world_size_
+        super().__init__(
+            dim0=dim0,
+            dim1=dim1,
+            dim2=dim2,
+            weight_names=weight_names,
+            bias_names=bias_names,
+            data_type=data_type,
+            quant_method=quant_method,
+            tp_rank=self.tp_rank_,
+            tp_world_size=self.tp_world_size_,
+        )
+        self.param_slicer = get_row_slice_mixin(
+            quant_method_name="none", tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
+        )
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index d7bbe5567..1a8f59723 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -19,10 +19,15 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
+        self.load_cnt = 0
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
+            self.load_cnt += 1
+
+    def verify_load(self):
+        return self.load_cnt == 1
 
     def _native_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
@@ -42,7 +47,9 @@ def _native_forward(
     def _triton_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
-        assert input.ndim == 2 and self.weight.ndim == 1
+        assert (  # accepts 2-D or 3-D input; the weight must be 1-D
+            input.ndim in [2, 3] and self.weight.ndim == 1
+        ), f"input.ndim: {input.ndim} not in [2, 3] or weight.ndim: {self.weight.ndim} != 1"
         if out is None:
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
         return rmsnorm_forward(x=input, weight=self.weight, eps=eps, out=out)
@@ -77,12 +84,18 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
         self.bias: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
+        self.load_cnt = 0
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
+            self.load_cnt += 1
         if self.bias_name in weights:
             self.bias.copy_(weights[self.bias_name])
+            self.load_cnt += 1
+
+    def verify_load(self):
+        return self.load_cnt == 2
 
     def _native_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
@@ -162,6 +175,7 @@ def load_hf_weights(self, weights):
             self.weight[:, end - start].copy_(t_weight[start:end].to(self.data_type_))
             # the padding part is zero
             self.weight[:, end:].zero_()
+            self.load_cnt += 1
 
 
 class NoTpGEMMANormWeight(RMSNormWeight):
@@ -173,7 +187,8 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
-        self.weight += 1
+            self.weight += 1
+            self.load_cnt += 1
 
 
 class QKRMSNORMWeight(RMSNormWeight):
diff --git a/lightllm/common/quantization/__init__.py b/lightllm/common/quantization/__init__.py
index bf99622ef..1e4745449 100644
--- a/lightllm/common/quantization/__init__.py
+++ b/lightllm/common/quantization/__init__.py
@@ -36,7 +36,7 @@ def _mapping_quant_method(self):
         if self.hf_quantization_method == "fp8":
             block_size = self.hf_quantization_config.get("weight_block_size", None)
             if block_size == [128, 128]:
-                from lightllm.common.quantization.deepgemm_quant import HAS_DEEPGEMM
+                from lightllm.common.quantization.deepgemm import HAS_DEEPGEMM
 
                 if HAS_DEEPGEMM:
                     self.quant_type = "deepgemm-fp8w8a8-b128"
diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index c9f227120..80be14c33 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -48,7 +48,7 @@ class DeepGEMMFP8w8a8B128QuantizationMethod(DeepGEMMBaseQuantizationMethod):
     def __init__(self):
         super().__init__()
         self.block_size = 128
-        self.weight_suffix = None
+        self.weight_suffix = "weight"
         self.weight_zero_point_suffix = None
         self.weight_scale_suffix = "weight_scale_inv"
         self.has_weight_scale = True
@@ -102,9 +102,9 @@ def create_weight(
     ) -> WeightPack:
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        weight_scale = torch.empty(
-            expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
-        ).cuda(device_id)
+        scale_out_dim = (out_dim + self.block_size - 1) // self.block_size
+        scale_in_dim = (in_dim + self.block_size - 1) // self.block_size
+        weight_scale = torch.empty(expert_prefix + (scale_out_dim, scale_in_dim), dtype=torch.float32).cuda(device_id)
         return WeightPack(weight=weight, weight_scale=weight_scale)
 
     def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
@@ -112,15 +112,7 @@ def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx:
         return
 
     def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[
-            start_idx // self.block_size : start_idx + weight_scale.shape[0] // self.block_size
-        ].copy_(weight_scale)
-        return
-
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_zero_point[
-            start_idx // self.block_size : start_idx + weight_zero_point.shape[0] // self.block_size
-        ].copy_(weight_zero_point)
+        weight_pack.weight_scale[start_idx // self.block_size :][: weight_scale.shape[0]].copy_(weight_scale)
         return
 
 
diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
index d927f22d1..1e8d572e1 100644
--- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
@@ -6,6 +6,7 @@
 from lightllm.utils.envs_utils import enable_env_vars, get_env_start_args
 from lightllm.common.basemodel.layer_weights.meta_weights import (
     ROWMMWeight,
+    ROWBMMWeight,
     COLMMWeight,
     RMSNormWeight,
     FusedMoeWeightEP,
@@ -65,31 +66,14 @@ def _init_weight(self):
             self._init_ffn()
         self._init_norm()
 
-    def _load_kb(self, kv_b_proj_):
-        k_b_proj_ = kv_b_proj_.view(self.num_attention_heads, self.qk_nope_head_dim * 2, self.kv_lora_rank)[
-            :, : self.qk_nope_head_dim, :
-        ]
-        return k_b_proj_.contiguous().to(kv_b_proj_.dtype)
-
-    def _load_kb_scale(self, kv_b_proj_, block_size):
-        k_b_proj_scale_ = kv_b_proj_.view(
-            self.num_attention_heads, self.qk_nope_head_dim * 2 // block_size, self.kv_lora_rank // block_size
-        )[:, : self.qk_nope_head_dim // block_size, :]
-        return k_b_proj_scale_.contiguous().to(kv_b_proj_.dtype)
-
-    def _load_vb(self, kv_b_proj_):
-        v_b_proj_ = kv_b_proj_.T.view(self.kv_lora_rank, self.num_attention_heads, self.qk_nope_head_dim * 2,)[
-            :, :, self.qk_nope_head_dim :
-        ].transpose(0, 1)
-        return v_b_proj_.contiguous().to(kv_b_proj_.dtype)
-
-    def _load_vb_scale(self, kv_b_proj_scale_, block_size):
-        v_b_proj_scale_ = kv_b_proj_scale_.T.view(
-            self.kv_lora_rank // block_size,
-            self.num_attention_heads,
-            self.qk_nope_head_dim * 2 // block_size,
-        )[:, :, self.qk_nope_head_dim // block_size :].transpose(0, 1)
-        return v_b_proj_scale_.contiguous().to(kv_b_proj_scale_.dtype)
+    def _split_kv_b_proj(self, kv_b_proj_):
+        """Split the fused kv_b_proj weight into per-head k_b_proj and v_b_proj tensors."""
+        # view as num_attention_heads x (qk_nope_head_dim + v_head_dim) x kv_lora_rank; split checks the sizes
+        kv_b_proj_ = kv_b_proj_.view(self.num_attention_heads, -1, self.kv_lora_rank)
+        k_b_proj_, v_b_proj_ = torch.split(kv_b_proj_, [self.qk_nope_head_dim, self.v_head_dim], dim=-2)
+        # k: num_attention_heads x qk_nope_head_dim x kv_lora_rank
+        # v: num_attention_heads x kv_lora_rank x v_head_dim
+        return k_b_proj_.contiguous(), v_b_proj_.transpose(1, 2).contiguous()
 
     def _rename_shared_experts(self, weights, weight_scale_suffix):
         # 将共享专家对应的参数，改造为与路由专家一致的权重名称和映射关系。
@@ -122,21 +106,9 @@ def load_hf_weights(self, weights):
                     kv_b_proj_.cuda(),
                     weights[f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + weight_scale_suffix].cuda(),
                 ).cpu()
-            weights[f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight"] = self._load_kb(kv_b_proj_)
-            weights[f"model.layers.{self.layer_num_}.self_attn.v_b_proj.weight"] = self._load_vb(kv_b_proj_)
-
-        if (
-            self.quant_cfg.quantized_weight
-            and f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + weight_scale_suffix in weights
-        ):
-            kv_b_proj_scale_ = weights[f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + weight_scale_suffix]
-            block_size = 128
-            weights[f"model.layers.{self.layer_num_}.self_attn.k_b_proj." + weight_scale_suffix] = self._load_kb_scale(
-                kv_b_proj_scale_, block_size
-            )
-            weights[f"model.layers.{self.layer_num_}.self_attn.v_b_proj." + weight_scale_suffix] = self._load_vb_scale(
-                kv_b_proj_scale_, block_size
-            )
+            k_b_proj_, v_b_proj_ = self._split_kv_b_proj(kv_b_proj_)
+            weights[f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight"] = k_b_proj_
+            weights[f"model.layers.{self.layer_num_}.self_attn.v_b_proj.weight"] = v_b_proj_
 
         # rename the shared experts weight
         if self.num_fused_shared_experts > 0:
@@ -181,20 +153,22 @@ def _init_qkvo(self):
                 data_type=self.data_type_,
                 quant_method=self.get_quant_method("q_b_proj"),
             )
-        # self.k_b_proj_ = ROWBMMWeight(
-        #     weight_names=f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight",
-        #     data_type=self.data_type_,
-        #     quant_cfg=None,
-        #     layer_num=self.layer_num_,
-        #     name="k_b_proj",
-        # )
-        # self.v_b_proj_ = ROWBMMWeight(
-        #     weight_names=f"model.layers.{self.layer_num_}.self_attn.v_b_proj.weight",
-        #     data_type=self.data_type_,
-        #     quant_cfg=None,
-        #     layer_num=self.layer_num_,
-        #     name="v_b_proj",
-        # )
+        self.k_b_proj_ = ROWBMMWeight(
+            dim0=self.num_attention_heads,
+            dim1=self.qk_nope_head_dim,
+            dim2=self.kv_lora_rank,
+            weight_names=f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight",
+            data_type=self.data_type_,
+            quant_method=None,
+        )
+        self.v_b_proj_ = ROWBMMWeight(
+            dim0=self.num_attention_heads,
+            dim1=self.kv_lora_rank,
+            dim2=self.v_head_dim,
+            weight_names=f"model.layers.{self.layer_num_}.self_attn.v_b_proj.weight",
+            data_type=self.data_type_,
+            quant_method=None,
+        )
         if self.enable_cc_method:
             self.cc_kv_b_proj_ = ROWMMWeight(
                 in_dim=self.kv_lora_rank,
@@ -212,12 +186,13 @@ def _init_qkvo(self):
             quant_method=self.get_quant_method("o_weight"),
         )
 
-    def _load_mlp(self, mlp_prefix):
+    def _load_mlp(self, mlp_prefix, is_shared_experts=False):
         moe_mode = os.getenv("MOE_MODE", "TP")
+        mlp_inter = self.moe_inter if is_shared_experts else self.n_inter
         if self.is_moe and moe_mode == "EP":
             self.gate_up_proj = ROWMMWeight(
                 in_dim=self.n_embed,
-                out_dims=[self.moe_inter, self.moe_inter],
+                out_dims=[mlp_inter, mlp_inter],
                 weight_names=[f"{mlp_prefix}.gate_proj.weight", f"{mlp_prefix}.up_proj.weight"],
                 data_type=self.data_type_,
                 quant_method=self.get_quant_method("gate_up_proj"),
@@ -225,7 +200,7 @@ def _load_mlp(self, mlp_prefix):
                 tp_world_size=1,
             )
             self.down_proj = COLMMWeight(
-                in_dim=self.moe_inter,
+                in_dim=mlp_inter,
                 out_dims=[self.n_embed],
                 weight_names=f"{mlp_prefix}.down_proj.weight",
                 data_type=self.data_type_,
@@ -236,13 +211,13 @@ def _load_mlp(self, mlp_prefix):
         else:
             self.gate_up_proj = ROWMMWeight(
                 in_dim=self.n_embed,
-                out_dims=[self.n_inter, self.n_inter],
+                out_dims=[mlp_inter, mlp_inter],
                 weight_names=[f"{mlp_prefix}.gate_proj.weight", f"{mlp_prefix}.up_proj.weight"],
                 data_type=self.data_type_,
                 quant_method=self.get_quant_method("gate_up_proj"),
             )
             self.down_proj = COLMMWeight(
-                in_dim=self.n_inter,
+                in_dim=mlp_inter,
                 out_dims=[self.n_embed],
                 weight_names=f"{mlp_prefix}.down_proj.weight",
                 data_type=self.data_type_,
@@ -256,7 +231,7 @@ def _init_moe(self):
             out_dims=[self.n_routed_experts],
             weight_names=f"model.layers.{self.layer_num_}.mlp.gate.weight",
             data_type=self.data_type_,
-            quant_method=self.get_quant_method("moe_gate"),
+            quant_method=None,
             tp_rank=0,
             tp_world_size=1,
         )
@@ -267,7 +242,7 @@ def _init_moe(self):
         # 专家对应的 gate_up_proj 等weight 参数。当 num_fused_shared_experts
         # == 0 时，说明不存在融合共享专家，共享专家单独加载和进行推理。
         if self.num_fused_shared_experts == 0:
-            self._load_mlp(f"model.layers.{self.layer_num_}.mlp.shared_experts")
+            self._load_mlp(f"model.layers.{self.layer_num_}.mlp.shared_experts", is_shared_experts=True)
         moe_mode = os.getenv("MOE_MODE", "TP")
         assert moe_mode in ["EP", "TP"]
         if moe_mode == "TP":
@@ -318,7 +293,7 @@ def _init_norm(self):
             data_type=self.data_type_,
         )
         self.kv_a_layernorm_ = RMSNormWeight(
-            dim=self.kv_lora_rank + self.qk_rope_head_dim,
+            dim=self.kv_lora_rank,
             weight_name=f"model.layers.{self.layer_num_}.self_attn.kv_a_layernorm.weight",
             data_type=self.data_type_,
         )
diff --git a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
index 17023d0cb..54cf7f02d 100644
--- a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
@@ -32,15 +32,6 @@ def _init_weight_names(self):
         self._ffn_norm_weight_name = f"model.layers.{self.layer_num_}.post_attention_layernorm.weight"
         self._ffn_norm_bias_name = None
 
-    def load_hf_weights(self, weights):
-        kv_b_quant_method = self.quant_cfg.get_quant_method(self.layer_num_, "kv_b_proj")
-        if self.quant_cfg.quantized_weight:
-            _k_scale_weight_name = self._k_weight_name.replace("weight", kv_b_quant_method.weight_scale_suffix)
-            self._repeat_weight(_k_scale_weight_name, weights)
-            _v_scale_weight_name = self._v_weight_name.replace("weight", kv_b_quant_method.weight_scale_suffix)
-            self._repeat_weight(_v_scale_weight_name, weights)
-        return super().load_hf_weights(weights)
-
     def _init_weight(self):
         self._init_qkv()
         self._init_o()

From ff76f575cd28b143be178c15bd0c83bcee98d962 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 20 Jan 2026 14:16:46 +0000
Subject: [PATCH 24/65] fix unittest

---
 .../common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py b/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py
index 671805a3d..ab2cc4976 100644
--- a/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py
+++ b/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py
@@ -16,7 +16,7 @@ def is_fp8_native_supported():
 import random
 from lightllm.common.fused_moe.moe_silu_and_mul_mix_quant_ep import silu_and_mul_masked_post_quant_fwd
 from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
-from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
+from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)

From 67c5823b0ef4b39d659af354982d0f07bc1abf42 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 22 Jan 2026 14:48:20 +0000
Subject: [PATCH 25/65] refactor fuse_moe

---
 lightllm/common/basemodel/basemodel.py        |   6 +-
 .../layer_weights/meta_weights/__init__.py    |   2 +-
 .../meta_weights/embedding_weight.py          |  24 +-
 .../fused_moe/fused_moe_weight.py             | 315 +++++++++++++++
 .../fused_moe/fused_moe_weight_ep.py          |  59 +--
 .../fused_moe_weight_ep_redundancy.py         |   4 +-
 .../fused_moe/fused_moe_weight_tp.py          | 364 ------------------
 .../fused_moe/gpt_oss_fused_moe_weight_tp.py  |  10 +-
 .../meta_weights/fused_moe/impl/__init__.py   |  14 +
 .../meta_weights/fused_moe/impl/base_impl.py  |  55 +++
 .../fused_moe/impl/deepgemm_impl.py           | 336 ++++++++++++++++
 .../fused_moe/impl/marlin_impl.py             |  56 +++
 .../fused_moe/impl/triton_impl.py             | 138 +++++++
 .../meta_weights/mm_weight/mm_weight.py       |  40 +-
 .../layer_weights/meta_weights/norm_weight.py |  17 +-
 .../triton_kernel}/fused_moe/__init__.py      |   0
 .../fused_moe/deepep_scatter_gather.py        |   0
 .../fused_moe/grouped_fused_moe.py            |   0
 .../fused_moe/grouped_fused_moe_ep.py         |   8 +-
 .../triton_kernel}/fused_moe/grouped_topk.py  |   0
 .../fused_moe/moe_kernel_configs.py           |   0
 .../fused_moe/moe_silu_and_mul.py             |   0
 .../fused_moe/moe_silu_and_mul_config.py      |   0
 .../moe_silu_and_mul_mix_quant_ep.py          |   0
 .../fused_moe/moe_sum_recude_config.py        |   0
 .../fused_moe/moe_sum_reduce.py               |   0
 .../triton_kernel}/fused_moe/softmax_topk.py  |   0
 .../triton_kernel}/fused_moe/topk_select.py   |   6 +-
 .../common/quantization/quantize_method.py    |   4 +
 lightllm/distributed/communication_op.py      |   4 +-
 .../layer_infer/transformer_layer_infer.py    |   4 +-
 .../layer_weights/transformer_layer_weight.py |  60 +--
 .../layer_weights/transformer_layer_weight.py |   5 +-
 .../layer_infer/transformer_layer_infer.py    |   2 +-
 .../layer_weights/transformer_layer_weight.py |  41 +-
 .../layer_infer/transformer_layer_infer.py    |   5 +-
 .../layer_weights/transformer_layer_weight.py |  48 +--
 .../transformers_layer_weight.py              |   1 -
 .../layer_weights/transformer_layer_weight.py |   1 -
 lightllm/server/api_cli.py                    |   7 +-
 test/start_scripts/README.md                  |   2 +-
 test/start_scripts/multi_node_ep_node0.sh     |   4 +-
 test/start_scripts/multi_node_ep_node1.sh     |   4 +-
 .../multi_pd_master/pd_prefill.sh             |   5 +-
 test/start_scripts/single_node_ep.sh          |   5 +-
 .../single_pd_master/pd_decode.sh             |   3 +-
 .../single_pd_master/pd_nixl_decode.sh        |   3 +-
 .../single_pd_master/pd_nixl_prefill.sh       |   3 +-
 .../single_pd_master/pd_prefill.sh            |   5 +-
 49 files changed, 1066 insertions(+), 604 deletions(-)
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
 delete mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/__init__.py
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py
 create mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/__init__.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/deepep_scatter_gather.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/grouped_fused_moe.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/grouped_fused_moe_ep.py (96%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/grouped_topk.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/moe_kernel_configs.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/moe_silu_and_mul.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/moe_silu_and_mul_config.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/moe_silu_and_mul_mix_quant_ep.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/moe_sum_recude_config.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/moe_sum_reduce.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/softmax_topk.py (100%)
 rename lightllm/common/{ => basemodel/triton_kernel}/fused_moe/topk_select.py (96%)

diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
index 2dcf0c434..e6405e4d7 100755
--- a/lightllm/common/basemodel/basemodel.py
+++ b/lightllm/common/basemodel/basemodel.py
@@ -637,8 +637,6 @@ def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: Mod
 
         assert model_input0.mem_indexes.is_cuda
         assert model_input1.mem_indexes.is_cuda
-        input_ids0, input_ids1 = model_input0.input_ids, model_input1.input_ids
-
         infer_state0 = self._create_inferstate(model_input0, 0)
         init_req_to_token_indexes(
             req_to_token_indexs=self.req_manager.req_to_token_indexs,
@@ -668,9 +666,7 @@ def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: Mod
         prefill_mem_indexes_ready_event = torch.cuda.Event()
         prefill_mem_indexes_ready_event.record()
 
-        model_output0, model_output1 = self._overlap_tpsp_context_forward(
-            input_ids0, infer_state0, input_ids1=input_ids1, infer_state1=infer_state1
-        )
+        model_output0, model_output1 = self._overlap_tpsp_context_forward(infer_state0, infer_state1=infer_state1)
 
         # 在开启使用deepep的时候，需要调用clear_deepep_buffer做资源清理，没有启用的时候
         # 该调用没有实际意义
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index b67f271ca..ab0e5b604 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -9,5 +9,5 @@
 from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight, QKRMSNORMWeight
 from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
 from .att_sink_weight import TpAttSinkWeight
-from .fused_moe.fused_moe_weight_tp import create_tp_moe_wegiht_obj
 from .fused_moe.fused_moe_weight_ep import FusedMoeWeightEP
+from .fused_moe.fused_moe_weight import FusedMoeWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index 9737f41b2..d4e03d0a1 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -5,9 +5,6 @@
 from .platform_op import PlatformAwareOp
 from lightllm.common.basemodel.triton_kernel.embedding import embedding as embedding_kernel
 from lightllm.utils.dist_utils import get_dp_world_size, get_current_rank_in_dp
-from lightllm.utils.log_utils import init_logger
-
-logger = init_logger(__name__)
 
 
 class EmbeddingWeight(BaseWeightTpl, PlatformAwareOp):
@@ -28,7 +25,7 @@ def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch
     def _create_weight(self):
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
         self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
-        self.load_cnt = 0
+        self.weight.load_ok = False
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name not in weights:
@@ -39,12 +36,11 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         assert (
             loaded_vocab_size == self.vocab_size
         ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
-        logger.info(f"loaded weight vocab_size: {self.vocab_size}")
         self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
-        self.load_cnt += 1
+        self.weight.load_ok = True
 
     def verify_load(self):
-        return self.load_cnt == 1
+        return self.weight.load_ok
 
     def _native_forward(
         self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
@@ -114,12 +110,12 @@ def __init__(
         self._create_weight()
 
     def _create_weight(self):
-        self.load_cnt = 0
         if self._embedding_weight is not None:
             self.weight = self._embedding_weight.weight
             return
         tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
         self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
+        self.weight.load_ok = False
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         # When set tile_embedding=True, no need to load - EmbeddingWeight already loaded it
@@ -132,12 +128,11 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         assert (
             loaded_vocab_size == self.vocab_size
         ), f"loaded weight vocab_size: {loaded_vocab_size} != expected vocab_size: {self.vocab_size}"
-        logger.info(f"loaded weight vocab_size: {self.vocab_size}")
         self.weight.copy_(t_weight[self.tp_vocab_start_id : self.tp_vocab_end_id, :].to(self.data_type_))
-        self.load_cnt += 1
+        self.weight.load_ok = True
 
     def verify_load(self):
-        return self.load_cnt == 1 or self._embedding_weight is not None
+        return self.weight.load_ok
 
     def _native_forward(
         self, input: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
@@ -181,7 +176,7 @@ def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(
             self.max_position_embeddings, self.dim, dtype=self.data_type_, device=self.device_id_
         )
-        self.load_cnt = 0
+        self.weight.load_ok = False
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name not in weights:
@@ -191,12 +186,11 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         assert (
             loaded_max_position_embeddings == self.max_position_embeddings
         ), f"max_position_embeddings: {loaded_max_position_embeddings} != expected: {self.max_position_embeddings}"
-        logger.info(f"loaded weight max_position_embeddings: {self.max_position_embeddings}")
         self.weight.copy_(t_weight.to(self.data_type_))
-        self.load_cnt += 1
+        self.weight.load_ok = True
 
     def verify_load(self):
-        return self.load_cnt == 1
+        return self.weight.load_ok
 
     def _native_forward(
         self, input_ids: torch.Tensor, out: Optional[torch.Tensor] = None, _alloc_func=torch.empty
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
new file mode 100644
index 000000000..8b01f4643
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -0,0 +1,315 @@
+import torch
+import threading
+from typing import Dict, Any, Optional, Tuple
+from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
+from lightllm.common.quantization.quantize_method import WeightPack
+from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_slicer import (
+    get_row_slice_mixin,
+    get_col_slice_mixin,
+)
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.impl import select_fuse_moe_impl
+from lightllm.common.quantization.quantize_method import QuantizationMethod
+from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num, get_env_start_args
+from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
+
+
+class FusedMoeWeight(BaseWeightTpl):
+    def __init__(
+        self,
+        gate_proj_name: str,
+        down_proj_name: str,
+        up_proj_name: str,
+        e_score_correction_bias_name: str,
+        weight_prefix: str,
+        n_routed_experts: int,
+        hidden_size: int,
+        moe_intermediate_size: int,
+        data_type: torch.dtype,
+        quant_method: QuantizationMethod = None,
+        num_fused_shared_experts: int = 0,
+        layer_num: int = 0,
+        network_config: Dict[str, Any] = None,
+    ) -> None:
+        """Record naming and shape settings, choose the fused-MoE implementation and allocate the weights."""
+        super().__init__(data_type=data_type)
+        self.weight_prefix = weight_prefix
+        self.w1_weight_name = gate_proj_name
+        self.w2_weight_name = down_proj_name
+        self.w3_weight_name = up_proj_name
+        self.e_score_correction_bias_name = e_score_correction_bias_name
+        self.layer_num_ = layer_num
+        self.hidden_size = hidden_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.quant_method = quant_method
+        self.global_rank_ = get_global_rank()
+        self.global_world_size = get_global_world_size()
+        tp_kwargs = dict(tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_)
+        self.row_slicer = get_row_slice_mixin(self.quant_method.method_name, **tp_kwargs)
+        self.col_slicer = get_col_slice_mixin(self.quant_method.method_name, **tp_kwargs)
+        assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
+        self.enable_ep_moe = get_env_start_args().enable_ep_moe
+        self.n_routed_experts = n_routed_experts
+        self.num_fused_shared_experts = num_fused_shared_experts
+        # These two helpers set redundancy_expert_num and routed_scaling_factor, which the impl constructor needs.
+        self._init_config(network_config)
+        self._init_parallel_params()
+        impl_cls = select_fuse_moe_impl(self.quant_method, self.enable_ep_moe)
+        self.fuse_moe_impl = impl_cls(
+            n_routed_experts=self.n_routed_experts,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            redundancy_expert_num=self.redundancy_expert_num,
+            routed_scaling_factor=self.routed_scaling_factor,
+            quant_method=self.quant_method,
+        )
+        self.lock = threading.Lock()
+        self._create_weight()
+
+    def _init_config(self, network_config: Dict[str, Any]):
+        """Copy the routing settings out of the model config dict onto this layer."""
+        cfg = network_config
+        self.n_group, self.topk_group = cfg.get("n_group", 0), cfg.get("topk_group", 0)
+        self.use_grouped_topk = self.n_group > 0  # grouped top-k is enabled only for a positive n_group
+        self.norm_topk_prob, self.num_experts_per_tok = cfg["norm_topk_prob"], cfg["num_experts_per_tok"]
+        self.routed_scaling_factor = cfg.get("routed_scaling_factor", 1.0)
+        self.scoring_func = cfg.get("scoring_func", "softmax")
+
+    def _init_parallel_params(self):
+        """Set the local expert count/offset and per-rank intermediate size (TP defaults, EP overrides)."""
+        self.start_expert_id = self.redundancy_expert_num = 0
+        self.local_n_routed_experts = self.n_routed_experts + self.num_fused_shared_experts
+        self.split_inter_size = self.moe_intermediate_size // self.tp_world_size_
+        if self.enable_ep_moe:
+            assert self.num_fused_shared_experts == 0, "num_fused_shared_experts must be 0 when enable_ep_moe"
+            self.redundancy_expert_ids = get_redundancy_expert_ids(self.layer_num_)
+            self.redundancy_expert_num = get_redundancy_expert_num()
+            self.split_inter_size = self.moe_intermediate_size
+            self.start_expert_id = self.global_rank_ * self.n_routed_experts // self.global_world_size
+            self.local_n_routed_experts = self.n_routed_experts // self.global_world_size + self.redundancy_expert_num
+
+    def experts(
+        self,
+        input_tensor: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: int,
+        num_expert_group: int,
+        is_prefill: Optional[bool] = None,
+    ):
+        """Run the fused MoE forward by calling the selected ``fuse_moe_impl`` with this layer's weights."""
+        layer_kwargs = dict(
+            w13=self.w13, w2=self.w2, correction_bias=self.e_score_correction_bias, scoring_func=self.scoring_func
+        )
+        return self.fuse_moe_impl(
+            input_tensor=input_tensor,
+            router_logits=router_logits,
+            **layer_kwargs,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            is_prefill=is_prefill,
+        )
+
+    def low_latency_dispatch(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        """EP-only: forward the call to ``fuse_moe_impl.low_latency_dispatch``."""
+        assert self.enable_ep_moe, "low_latency_dispatch is only supported when enable_ep_moe is True"
+        # Routing settings and the correction bias come from this layer's attributes;
+        # the caller only supplies the hidden states and router logits.
+        routing_kwargs = dict(
+            e_score_correction_bias=self.e_score_correction_bias,
+            use_grouped_topk=self.use_grouped_topk,
+            num_experts_per_tok=self.num_experts_per_tok,
+            norm_topk_prob=self.norm_topk_prob,
+            topk_group=self.topk_group,
+            n_group=self.n_group,
+            scoring_func=self.scoring_func,
+        )
+        return self.fuse_moe_impl.low_latency_dispatch(
+            hidden_states=hidden_states, router_logits=router_logits, **routing_kwargs
+        )
+
+    def select_experts_and_quant_input(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        """EP-only: forward the call to ``fuse_moe_impl.select_experts_and_quant_input``."""
+        assert self.enable_ep_moe, "select_experts_and_quant_input is only supported when enable_ep_moe is True"
+        # The correction bias, ``w13`` and the routing settings come from this layer's attributes;
+        # the caller only supplies the hidden states and router logits.
+        layer_kwargs = dict(
+            e_score_correction_bias=self.e_score_correction_bias,
+            w13=self.w13,
+            use_grouped_topk=self.use_grouped_topk,
+            num_experts_per_tok=self.num_experts_per_tok,
+            norm_topk_prob=self.norm_topk_prob,
+            topk_group=self.topk_group,
+            n_group=self.n_group,
+            scoring_func=self.scoring_func,
+        )
+        return self.fuse_moe_impl.select_experts_and_quant_input(
+            hidden_states=hidden_states, router_logits=router_logits, **layer_kwargs
+        )
+
+    def dispatch(
+        self,
+        qinput_tensor: Tuple[torch.Tensor],
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        overlap_event: Optional[Any] = None,
+    ):
+        """EP-only: forward the call to ``fuse_moe_impl.dispatch``."""
+        assert self.enable_ep_moe, "dispatch is only supported when enable_ep_moe is True"
+        impl_dispatch = self.fuse_moe_impl.dispatch
+        # Every argument is handed over by keyword under its own name.
+        return impl_dispatch(
+            qinput_tensor=qinput_tensor, topk_idx=topk_idx, topk_weights=topk_weights, overlap_event=overlap_event
+        )
+
+    def masked_group_gemm(
+        self, recv_x: Tuple[torch.Tensor], masked_m: torch.Tensor, dtype: torch.dtype, expected_m: int
+    ):
+        """EP-only: forward the call to ``fuse_moe_impl.masked_group_gemm``.
+
+        ``recv_x``, ``masked_m``, ``dtype`` and ``expected_m`` are passed through unchanged, and this
+        layer's ``w13`` and ``w2`` weight packs are supplied alongside them.
+        """
+        assert self.enable_ep_moe, "masked_group_gemm is only supported when enable_ep_moe is True"
+        return self.fuse_moe_impl.masked_group_gemm(
+            recv_x=recv_x, w13=self.w13, w2=self.w2, masked_m=masked_m, dtype=dtype, expected_m=expected_m
+        )
+
+    def prefilled_group_gemm(
+        self,
+        num_recv_tokens_per_expert_list,
+        recv_x: Tuple[torch.Tensor],
+        recv_topk_idx: torch.Tensor,
+        recv_topk_weights: torch.Tensor,
+        hidden_dtype=torch.bfloat16,
+    ):
+        """EP-only: forward the call to ``fuse_moe_impl.prefilled_group_gemm`` with this layer's weights."""
+        assert self.enable_ep_moe, "prefilled_group_gemm is only supported when enable_ep_moe is True"
+        layer_kwargs = dict(w13=self.w13, w2=self.w2, hidden_dtype=hidden_dtype)
+        return self.fuse_moe_impl.prefilled_group_gemm(
+            num_recv_tokens_per_expert_list=num_recv_tokens_per_expert_list,
+            recv_x=recv_x,
+            recv_topk_idx=recv_topk_idx,
+            recv_topk_weights=recv_topk_weights,
+            **layer_kwargs,
+        )
+
+    def low_latency_combine(
+        self,
+        gemm_out_b: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        handle: Any,
+    ):
+        """EP-only: forward the call to ``fuse_moe_impl.low_latency_combine``."""
+        # The guard comes first so a non-EP layer fails with the message below
+        # before the implementation object is touched.
+        assert self.enable_ep_moe, "low_latency_combine is only supported when enable_ep_moe is True"
+        return self.fuse_moe_impl.low_latency_combine(
+            gemm_out_b=gemm_out_b, topk_idx=topk_idx, topk_weights=topk_weights, handle=handle
+        )
+
+    def combine(
+        self,
+        gemm_out_b: torch.Tensor,
+        handle: Any,
+        overlap_event: Optional[Any] = None,
+    ):
+        """EP-only: forward the call to ``fuse_moe_impl.combine``.
+
+        All three arguments are passed through by keyword under the same names.
+        """
+        assert self.enable_ep_moe, "combine is only supported when enable_ep_moe is True"
+        return self.fuse_moe_impl.combine(gemm_out_b=gemm_out_b, handle=handle, overlap_event=overlap_event)
+
+    def load_hf_weights(self, weights):
+        """Copy the correction bias and every local expert's tensors found in ``weights`` into this layer."""
+        bias_name = self.e_score_correction_bias_name
+        if bias_name in weights:
+            self.e_score_correction_bias.copy_(weights[bias_name])
+
+        quant = self.quant_method
+        first_expert = self.start_expert_id
+        for expert_id in range(first_expert, first_expert + self.local_n_routed_experts):
+            # The lock is taken once per tensor kind; scale and zero-point suffixes are read only when used.
+            with self.lock:
+                self._load_expert(expert_id, weights, type="weight", suffix=quant.weight_suffix)
+            if self.w13.weight_scale is not None:
+                with self.lock:
+                    self._load_expert(expert_id, weights, type="weight_scale", suffix=quant.weight_scale_suffix)
+            if self.w13.weight_zero_point is not None:
+                with self.lock:
+                    suffix = quant.weight_zero_point_suffix
+                    self._load_expert(expert_id, weights, type="weight_zero_point", suffix=suffix)
+
+    def verify_load(self):
+        # Always reports success; load_cnt is maintained by _load_expert but is not checked here.
+        return True
+
+    def _create_weight(self):
+        """Allocate the optional correction bias and the ``w13`` / ``w2`` weight packs.
+
+        Also resets ``load_cnt`` to zero.
+        """
+        inter_size = self.split_inter_size
+        # The bias buffer is allocated only when a bias tensor name was configured.
+        self.e_score_correction_bias = None
+        if self.e_score_correction_bias_name:
+            self.e_score_correction_bias = torch.empty(
+                (self.n_routed_experts,), dtype=self.data_type_, device=f"cuda:{self.device_id_}"
+            )
+
+        common = dict(dtype=self.data_type_, device_id=self.device_id_, num_experts=self.local_n_routed_experts)
+        # w13 receives both the w1 and the w3 tensors of each expert, hence the doubled out_dim.
+        self.w13: WeightPack = self.quant_method.create_weight(
+            out_dim=inter_size * 2,
+            in_dim=self.hidden_size,
+            **common,
+        )
+        self.w2: WeightPack = self.quant_method.create_weight(
+            out_dim=self.hidden_size,
+            in_dim=inter_size,
+            **common,
+        )
+        self.load_cnt = 0
+
+    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0):
+        """Quantize ``weight`` into ``weight_pack`` when the quant method asks for it, else load it as-is."""
+        quant = self.quant_method
+        store = quant.quantize if quant.weight_need_quanted(weight) else quant.load_weight
+        store(weight, weight_pack, start_idx)
+
+    def _load_expert(self, expert_idx, weights, type: str, suffix: str = "weight"):
+        """Load the w1/w3/w2 tensors of one expert for the given kind, skipping names absent from ``weights``."""
+        key_head = f"{self.weight_prefix}.{expert_idx}."
+        w1_key = f"{key_head}{self.w1_weight_name}.{suffix}"
+        w2_key = f"{key_head}{self.w2_weight_name}.{suffix}"
+        w3_key = f"{key_head}{self.w3_weight_name}.{suffix}"
+        local_idx = expert_idx - self.start_expert_id  # slot of this expert inside the local weight packs
+
+        # w1 and w3 share the row slicer and both land in w13: w1 at offset 0, w3 at split_inter_size.
+        row_load, row_slice = self._get_load_and_slice_func(type, is_row=True)
+        for key, offset in ((w1_key, 0), (w3_key, self.split_inter_size)):
+            if key in weights:
+                row_load(row_slice(weights[key]), self.w13.get_expert(local_idx), start_idx=offset)
+                self.load_cnt += 1
+        # w2 goes through the column slicer into its own weight pack.
+        col_load, col_slice = self._get_load_and_slice_func(type, is_row=False)
+        if w2_key in weights:
+            col_load(col_slice(weights[w2_key]), self.w2.get_expert(local_idx), start_idx=0)
+            self.load_cnt += 1
+
+    def _get_load_and_slice_func(self, type: str, is_row: bool = True):
+        """Return the ``(load_func, slice_func)`` pair for a tensor kind; raise ValueError for an unknown kind."""
+        slicer = self.row_slicer if is_row else self.col_slicer
+        if type == "weight":
+            return self._load_weight_func, slicer._slice_weight
+        if type == "weight_scale":
+            return self.quant_method.load_weight_scale, slicer._slice_weight_scale
+        if type == "weight_zero_point":
+            return self.quant_method.load_weight_zero_point, slicer._slice_weight_zero_point
+        # Fail here rather than return None, which callers would only hit later while unpacking the pair.
+        raise ValueError(f"unsupported weight type: {type!r}")
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
index 342026de2..6659a98d4 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
@@ -4,14 +4,14 @@
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
 from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
 from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
-from lightllm.common.fused_moe.grouped_fused_moe_ep import (
+from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe_ep import (
     fused_experts_impl,
     masked_group_gemm,
     _deepgemm_grouped_fp8_nt_contiguous,
 )
-from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
+from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
 from lightllm.distributed import dist_group_manager
-from lightllm.common.fused_moe.topk_select import select_experts
+from lightllm.common.basemodel.triton_kernel.fused_moe.topk_select import select_experts
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
 from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num
 from lightllm.utils.envs_utils import get_env_start_args
@@ -19,7 +19,7 @@
     per_token_group_quant_fp8,
     tma_align_input_scale,
 )
-from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
+from lightllm.common.basemodel.triton_kernel.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
 from lightllm.utils.log_utils import init_logger
 from lightllm.common.triton_utils.autotuner import Autotuner
@@ -185,57 +185,6 @@ def _select_experts(
             )
         return topk_weights, topk_ids
 
-    def _native_forward(
-        self,
-        input_tensor,
-        router_logits,
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        is_prefill,
-    ):
-        """PyTorch native implementation for EP MoE forward pass."""
-        topk_weights, topk_ids = self._select_experts(
-            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-        )
-
-        w1, w1_scale = self.w1
-        w2, w2_scale = self.w2
-
-        # Native PyTorch implementation (less optimized but works on all platforms)
-        batch_size, hidden_size = input_tensor.shape
-        intermediate_size = w1.shape[1] // 2
-
-        output = torch.zeros_like(input_tensor)
-
-        for i in range(batch_size):
-            expert_output = torch.zeros(hidden_size, dtype=input_tensor.dtype, device=input_tensor.device)
-            for j in range(top_k):
-                expert_idx = topk_ids[i, j].item()
-                weight = topk_weights[i, j]
-
-                # Get local expert index (EP mode uses local expert indices)
-                local_expert_idx = expert_idx % self.ep_load_expert_num
-
-                # Get expert weights
-                w1_expert = w1[local_expert_idx, :intermediate_size, :]  # gate
-                w3_expert = w1[local_expert_idx, intermediate_size:, :]  # up
-                w2_expert = w2[local_expert_idx]
-
-                # Compute: SiLU(x @ w1.T) * (x @ w3.T) @ w2.T
-                x = input_tensor[i : i + 1]
-                gate = torch.nn.functional.silu(torch.mm(x, w1_expert.T))
-                up = torch.mm(x, w3_expert.T)
-                hidden = gate * up
-                expert_out = torch.mm(hidden, w2_expert.T)
-                expert_output += weight * expert_out.squeeze(0)
-
-            output[i] = expert_output
-
-        return output
-
     def _cuda_forward(
         self,
         input_tensor,
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py
index 933a94f78..a31cd1880 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py
@@ -1,6 +1,6 @@
 import numpy as np
 import torch
-from .fused_moe_weight_ep import FusedMoeWeightEP
+from .fused_moe_weight import FusedMoeWeight
 from lightllm.utils.log_utils import init_logger
 from typing import Dict
 
@@ -10,7 +10,7 @@
 class FusedMoeWeightEPAutoRedundancy:
     def __init__(
         self,
-        ep_fused_moe_weight: FusedMoeWeightEP,
+        ep_fused_moe_weight: FusedMoeWeight,
     ) -> None:
         super().__init__()
         self._ep_w = ep_fused_moe_weight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
deleted file mode 100644
index c7892ab3b..000000000
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_tp.py
+++ /dev/null
@@ -1,364 +0,0 @@
-import torch
-from typing import Dict, Any, Union
-from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
-from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
-from lightllm.common.quantization import Quantcfg
-from lightllm.common.quantization.quantize_method import WeightPack
-from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_slicer import (
-    get_row_slice_mixin,
-    get_col_slice_mixin,
-)
-import threading
-
-
-def create_tp_moe_wegiht_obj(
-    gate_proj_name: str,
-    down_proj_name: str,
-    up_proj_name: str,
-    e_score_correction_bias_name: str,
-    weight_prefix: str,
-    n_routed_experts: int,
-    num_fused_shared_experts: int,
-    split_inter_size: int,
-    data_type: torch.dtype,
-    network_config: Dict[str, Any],
-    layer_num: int,
-    quant_cfg: Quantcfg = None,
-) -> Union["FusedMoeWeightTP", "FusedAWQMARLINMoeWeightTP"]:
-    quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
-    if quant_method is not None and quant_method.method_name == "awq_marlin":
-        return FusedAWQMARLINMoeWeightTP(
-            gate_proj_name=gate_proj_name,
-            down_proj_name=down_proj_name,
-            up_proj_name=up_proj_name,
-            e_score_correction_bias_name=e_score_correction_bias_name,
-            weight_prefix=weight_prefix,
-            n_routed_experts=n_routed_experts,
-            num_fused_shared_experts=num_fused_shared_experts,
-            split_inter_size=split_inter_size,
-            data_type=data_type,
-            network_config=network_config,
-            layer_num=layer_num,
-            quant_cfg=quant_cfg,
-        )
-    else:
-        return FusedMoeWeightTP(
-            gate_proj_name=gate_proj_name,
-            down_proj_name=down_proj_name,
-            up_proj_name=up_proj_name,
-            e_score_correction_bias_name=e_score_correction_bias_name,
-            weight_prefix=weight_prefix,
-            n_routed_experts=n_routed_experts,
-            num_fused_shared_experts=num_fused_shared_experts,
-            split_inter_size=split_inter_size,
-            data_type=data_type,
-            network_config=network_config,
-            layer_num=layer_num,
-            quant_cfg=quant_cfg,
-        )
-
-
-class FusedMoeWeightTP(BaseWeightTpl, PlatformAwareOp):
-    def __init__(
-        self,
-        gate_proj_name: str,
-        down_proj_name: str,
-        up_proj_name: str,
-        e_score_correction_bias_name: str,
-        weight_prefix: str,
-        n_routed_experts: int,
-        num_fused_shared_experts: int,
-        split_inter_size: int,
-        data_type: torch.dtype,
-        network_config: Dict[str, Any],
-        layer_num: int,
-        quant_cfg: Quantcfg = None,
-    ) -> None:
-        super().__init__()
-        self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
-        self.quantized_weight = quant_cfg.quantized_weight
-        if self.quant_method.method_name != "none":
-            self.weight_scale_suffix = self.quant_method.weight_scale_suffix
-
-        self.w1_weight_name = gate_proj_name
-        self.w2_weight_name = down_proj_name
-        self.w3_weight_name = up_proj_name
-
-        self.e_score_correction_bias_name = e_score_correction_bias_name
-        self.weight_prefix = weight_prefix
-        assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
-        self.n_routed_experts = n_routed_experts + num_fused_shared_experts
-        self.num_fused_shared_experts = num_fused_shared_experts
-        self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0)
-        self.split_inter_size = split_inter_size
-        self.data_type_ = data_type
-        self.hidden_size = network_config.get("hidden_size")
-        self.e_score_correction_bias = None
-        self.scoring_func = network_config.get("scoring_func", "softmax")
-        self.row_slicer = get_row_slice_mixin(
-            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
-        )
-        self.col_slicer = get_col_slice_mixin(
-            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
-        )
-        self.lock = threading.Lock()
-        self._create_weight()
-
-    def _create_weight(self):
-        total_expert_num = self.n_routed_experts
-        intermediate_size = self.split_inter_size
-
-        # Create e_score_correction_bias
-        if self.e_score_correction_bias_name is not None:
-            self.e_score_correction_bias = torch.empty(
-                (total_expert_num,),
-                dtype=self.data_type_,
-                device=f"cuda:{self.device_id_}",
-            )
-
-        self.w13: WeightPack = self.quant_method.create_weight(
-            out_dim=intermediate_size * 2,
-            in_dim=self.hidden_size,
-            dtype=self.data_type_,
-            device_id=self.device_id_,
-            num_experts=total_expert_num,
-        )
-        self.w2: WeightPack = self.quant_method.create_weight(
-            out_dim=self.hidden_size,
-            in_dim=intermediate_size,
-            dtype=self.data_type_,
-            device_id=self.device_id_,
-            num_experts=total_expert_num,
-        )
-        self.load_cnt = 0
-
-    def _select_experts(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        """Select experts and return topk weights and ids."""
-        from lightllm.common.fused_moe.topk_select import select_experts
-
-        topk_weights, topk_ids = select_experts(
-            hidden_states=input_tensor,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            scoring_func=self.scoring_func,
-        )
-        topk_weights.mul_(self.routed_scaling_factor)
-        if self.num_fused_shared_experts > 0:
-            pad_topk_ids = (
-                torch.arange(
-                    start=self.n_routed_experts - self.num_fused_shared_experts,
-                    end=self.n_routed_experts,
-                    step=1,
-                    dtype=topk_ids.dtype,
-                    device="cuda",
-                )
-                .view(1, self.num_fused_shared_experts)
-                .repeat(topk_ids.shape[0], 1)
-            )
-            pad_topk_weights = torch.full(
-                (topk_weights.shape[0], self.num_fused_shared_experts),
-                fill_value=1.0,
-                device="cuda",
-                dtype=topk_weights.dtype,
-            )
-
-            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
-            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
-        return topk_weights, topk_ids
-
-    def _native_forward(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        topk_weights, topk_ids = self._select_experts(
-            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-        )
-
-        w13, _ = self.w13.weight, self.w13.weight_scale
-        w2, _ = self.w2.weight, self.w2.weight_scale
-
-        batch_size, hidden_size = input_tensor.shape
-        intermediate_size = w13.shape[1] // 2
-
-        output = torch.zeros_like(input_tensor)
-
-        for i in range(batch_size):
-            expert_output = torch.zeros(hidden_size, dtype=input_tensor.dtype, device=input_tensor.device)
-            for j in range(top_k):
-                expert_idx = topk_ids[i, j].item()
-                weight = topk_weights[i, j]
-
-                w1 = w13[expert_idx, :intermediate_size, :]  # gate
-                w3 = w13[expert_idx, intermediate_size:, :]  # up
-                w2_expert = w2[expert_idx]
-
-                # Compute: SiLU(x @ w1.T) * (x @ w3.T) @ w2.T
-                x = input_tensor[i : i + 1]
-                gate = torch.nn.functional.silu(torch.mm(x, w1.T))
-                up = torch.mm(x, w3.T)
-                hidden = gate * up
-                expert_out = torch.mm(hidden, w2_expert.T)
-                expert_output += weight * expert_out.squeeze(0)
-
-            output[i] = expert_output
-
-        input_tensor.copy_(output)
-        return
-
-    def _cuda_forward(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        """CUDA optimized implementation of MoE forward pass."""
-        topk_weights, topk_ids = self._select_experts(
-            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-        )
-
-        w13, w13_scale = self.w13.weight, self.w13.weight_scale
-        w2, w2_scale = self.w2.weight, self.w2.weight_scale
-        use_fp8_w8a8 = self.quant_method.method_name != "none"
-
-        from lightllm.common.fused_moe.grouped_fused_moe import fused_experts
-
-        fused_experts(
-            hidden_states=input_tensor,
-            w1=w13,
-            w2=w2,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            use_fp8_w8a8=use_fp8_w8a8,
-            w1_scale=w13_scale,
-            w2_scale=w2_scale,
-        )
-        return
-
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
-        """Backward compatible method that routes to platform-specific implementation."""
-        return self._forward(
-            input_tensor=input_tensor,
-            router_logits=router_logits,
-            top_k=top_k,
-            renormalize=renormalize,
-            use_grouped_topk=use_grouped_topk,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-        )
-
-    def load_hf_weights(self, weights):
-        # Load bias
-        if self.e_score_correction_bias_name in weights:
-            self.e_score_correction_bias.copy_(weights[self.e_score_correction_bias_name])
-
-        # Load each expert with TP slicing
-        for i_experts in range(self.n_routed_experts):
-            with self.lock:
-                self._load_expert(i_experts, weights, type="weight", suffix=self.quant_method.weight_suffix)
-            if self.w13.weight_scale is not None:
-                with self.lock:
-                    self._load_expert(
-                        i_experts, weights, type="weight_scale", suffix=self.quant_method.weight_scale_suffix
-                    )
-            if self.w13.weight_zero_point is not None:
-                with self.lock:
-                    self._load_expert(
-                        i_experts, weights, type="weight_zero_point", suffix=self.quant_method.weight_zero_point_suffix
-                    )
-
-    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0):
-        if self.quant_method.weight_need_quanted(weight):
-            self.quant_method.quantize(weight, weight_pack, start_idx)
-        else:
-            self.quant_method.load_weight(weight, weight_pack, start_idx)
-
-    def _load_expert(self, expert_idx, weights, type: str, suffix: str = "weight"):
-        w1_weight = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{suffix}"
-        w2_weight = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{suffix}"
-        w3_weight = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{suffix}"
-        intermediate_size = self.split_inter_size
-        load_func, slice_func = self._get_load_and_slice_func(type, is_row=True)
-        if w1_weight in weights:
-            load_func(slice_func(weights[w1_weight]), self.w13.get_expert(expert_idx), start_idx=0)
-            self.load_cnt += 1
-        if w3_weight in weights:
-            load_func(slice_func(weights[w3_weight]), self.w13.get_expert(expert_idx), start_idx=intermediate_size)
-            self.load_cnt += 1
-        load_func, slice_func = self._get_load_and_slice_func(type, is_row=False)
-        if w2_weight in weights:
-            load_func(slice_func(weights[w2_weight]), self.w2.get_expert(expert_idx), start_idx=0)
-            self.load_cnt += 1
-
-    def verify_load(self):
-        return self.load_cnt == self.n_routed_experts * 3 * 2
-
-    def _get_load_and_slice_func(self, type: str, is_row: bool = True):
-        if is_row:
-            slicer = self.row_slicer
-        else:
-            slicer = self.col_slicer
-        if type == "weight":
-            return self._load_weight_func, slicer._slice_weight
-        elif type == "weight_scale":
-            return getattr(self.quant_method, "load_weight_scale"), slicer._slice_weight_scale
-        elif type == "weight_zero_point":
-            return getattr(self.quant_method, "load_weight_zero_point"), slicer._slice_weight_zero_point
-
-
-class FusedAWQMARLINMoeWeightTP(FusedMoeWeightTP):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops
-
-        assert HAS_VLLM, "moe awq marlin quantization requires kernels of vllm"
-        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-            marlin_make_workspace_new,
-        )
-
-        self.workspace = marlin_make_workspace_new(self.w13.weight.device, 4)
-
-    def _native_forward(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        """AWQ Marlin quantization requires CUDA, native forward not supported."""
-        raise NotImplementedError("AWQ Marlin MoE requires CUDA platform, native forward not supported.")
-
-    def _cuda_forward(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        """CUDA optimized implementation using AWQ Marlin kernels."""
-        topk_weights, topk_ids = self._select_experts(
-            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-        )
-
-        w1, w1_scale, w1_zero_point = self.w13.weight, self.w13.weight_scale, self.w13.weight_zero_point
-        w2, w2_scale, w2_zero_point = self.w2.weight, self.w2.weight_scale, self.w2.weight_zero_point
-
-        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
-
-        fused_marlin_moe(
-            input_tensor,
-            w1,
-            w2,
-            None,
-            None,
-            w1_scale,
-            w2_scale,
-            router_logits,
-            topk_weights,
-            topk_ids,
-            quant_type_id=self.quant_method.vllm_quant_type.id,
-            apply_router_weight_on_input=False,
-            global_num_experts=-1,
-            expert_map=None,
-            w1_zeros=w1_zero_point,
-            w2_zeros=w2_zero_point,
-            workspace=self.workspace,
-            inplace=True,
-        )
-
-        return
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index 9821b5ad6..f3f153b0a 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -1,9 +1,9 @@
 import torch
 from typing import Dict, Any
 
-from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight_tp import FusedMoeWeightTP
-from lightllm.common.quantization import Quantcfg
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight import FusedMoeWeight
 from lightllm.utils.log_utils import init_logger
+from lightllm.common.quantization.quantize_method import QuantizationMethod
 
 logger = init_logger(__name__)
 
@@ -27,7 +27,7 @@
 ]
 
 
-class GPTOSSFusedMoeWeightTP(FusedMoeWeightTP):
+class GPTOSSFusedMoeWeightTP(FusedMoeWeight):
     def __init__(
         self,
         gate_up_proj_name: str,  # diff with FusedMoeWeightTP
@@ -41,7 +41,7 @@ def __init__(
         network_config: Dict[str, Any],
         layer_num: int,
         world_size: int = 1,  # diff with FusedMoeWeightTP
-        quant_cfg: Quantcfg = None,
+        quant_method: QuantizationMethod = None,
     ) -> None:
         super().__init__(
             gate_up_proj_name,
@@ -55,7 +55,7 @@ def __init__(
             data_type,
             network_config,
             layer_num,
-            quant_cfg,
+            quant_method,
         )
         self.hidden_size = network_config["hidden_size"]
 
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/__init__.py
new file mode 100644
index 000000000..67bb90e4e
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/__init__.py
@@ -0,0 +1,14 @@
+from lightllm.common.quantization.quantize_method import QuantizationMethod
+from .triton_impl import FuseMoeTriton
+from .marlin_impl import FuseMoeMarlin
+from .deepgemm_impl import FuseMoeDeepGEMM
+
+
+def select_fuse_moe_impl(quant_method: QuantizationMethod, enable_ep_moe: bool):
+    if enable_ep_moe:
+        return FuseMoeDeepGEMM
+
+    if quant_method.method_name == "awq_marlin":
+        return FuseMoeMarlin
+    else:
+        return FuseMoeTriton
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
new file mode 100644
index 000000000..2f5d169eb
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
@@ -0,0 +1,55 @@
+import torch
+from abc import abstractmethod
+from lightllm.common.quantization.quantize_method import (
+    WeightPack,
+    QuantizationMethod,
+)
+from typing import Optional
+from lightllm.utils.dist_utils import (
+    get_global_rank,
+    get_global_world_size,
+)
+
+
+class FuseMoeBaseImpl:
+    def __init__(
+        self,
+        n_routed_experts: int,
+        num_fused_shared_experts: int,
+        redundancy_expert_num: int,
+        routed_scaling_factor: float,
+        quant_method: QuantizationMethod,
+    ):
+        self.n_routed_experts = n_routed_experts
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.redundancy_expert_num = redundancy_expert_num
+        self.routed_scaling_factor = routed_scaling_factor
+        self.quant_method = quant_method
+        self.global_rank_ = get_global_rank()
+        self.global_world_size = get_global_world_size()
+        self.total_expert_num_contain_redundancy = (
+            self.n_routed_experts + self.redundancy_expert_num * self.global_world_size
+        )
+        self.workspace = self.create_workspace()
+
+    @abstractmethod
+    def create_workspace(self):
+        pass
+
+    @abstractmethod
+    def __call__(
+        self,
+        input_tensor: torch.Tensor,
+        router_logits: torch.Tensor,
+        w13: WeightPack,
+        w2: WeightPack,
+        correction_bias: Optional[torch.Tensor],
+        scoring_func: str,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: int,
+        num_expert_group: int,
+        is_prefill: Optional[bool] = None,
+    ):
+        pass
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py
new file mode 100644
index 000000000..f00d572d9
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py
@@ -0,0 +1,336 @@
+import torch
+from typing import Optional, Tuple, Any
+from .triton_impl import FuseMoeTriton
+from lightllm.distributed import dist_group_manager
+from lightllm.common.triton_utils.autotuner import Autotuner
+from lightllm.common.quantization.quantize_method import WeightPack
+from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
+from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe_ep import (
+    fused_experts_impl,
+    masked_group_gemm,
+    _deepgemm_grouped_fp8_nt_contiguous,
+)
+from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import (
+    per_token_group_quant_fp8,
+    tma_align_input_scale,
+)
+from lightllm.common.basemodel.triton_kernel.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
+from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
+from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
+
+
+class FuseMoeDeepGEMM(FuseMoeTriton):
+    def _select_experts(
+        self,
+        input_tensor: torch.Tensor,
+        router_logits: torch.Tensor,
+        correction_bias: Optional[torch.Tensor],
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: int,
+        num_expert_group: int,
+        scoring_func: str,
+    ):
+        """Select experts and return topk weights and ids."""
+        from lightllm.common.basemodel.triton_kernel.fused_moe.topk_select import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=input_tensor,
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            scoring_func=scoring_func,
+        )
+        topk_weights.mul_(self.routed_scaling_factor)
+        if self.redundancy_expert_num > 0:
+            redundancy_topk_ids_repair(
+                topk_ids=topk_ids,
+                redundancy_expert_ids=self.redundancy_expert_ids_tensor,
+                ep_expert_num=self.ep_n_routed_experts,
+                global_rank=self.global_rank_,
+                expert_counter=self.routed_expert_counter_tensor,
+                enable_counter=self.auto_update_redundancy_expert,
+            )
+        return topk_weights, topk_ids
+
+    def _fused_experts(
+        self,
+        input_tensor: torch.Tensor,
+        w13: WeightPack,
+        w2: WeightPack,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        router_logits: Optional[torch.Tensor] = None,
+        is_prefill: Optional[bool] = None,
+    ):
+
+        w13_weight, w13_scale = w13.weight, w13.weight_scale
+        w2_weight, w2_scale = w2.weight, w2.weight_scale
+        use_fp8_w8a8 = self.quant_method.method_name != "none"
+        output = fused_experts_impl(
+            hidden_states=input_tensor,
+            w1=w13_weight,
+            w2=w2_weight,
+            topk_weights=topk_weights,
+            topk_idx=topk_ids.to(torch.long),
+            num_experts=self.total_expert_num_contain_redundancy,  # total experts, including redundant ones
+            buffer=dist_group_manager.ep_buffer,
+            is_prefill=is_prefill,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_fp8_all2all=use_fp8_w8a8,
+            use_int8_w8a16=False,  # default to False
+            w1_scale=w13_scale,
+            w2_scale=w2_scale,
+            previous_event=None,  # for overlap
+        )
+        return output
+
+    def low_latency_dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        e_score_correction_bias: torch.Tensor,
+        use_grouped_topk: bool,
+        num_experts_per_tok: int,
+        norm_topk_prob: bool,
+        topk_group: int,
+        n_group: int,
+        scoring_func: str,
+    ):
+        topk_weights, topk_idx = self._select_experts(
+            input_tensor=hidden_states,
+            router_logits=router_logits,
+            correction_bias=e_score_correction_bias,
+            use_grouped_topk=use_grouped_topk,
+            top_k=num_experts_per_tok,
+            renormalize=norm_topk_prob,
+            topk_group=topk_group,
+            num_expert_group=n_group,
+            scoring_func=scoring_func,
+        )
+
+        topk_idx = topk_idx.to(torch.long)
+        num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank()
+        use_fp8_w8a8 = self.quant_method.method_name != "none"
+        recv_x, masked_m, handle, event, hook = dist_group_manager.ep_buffer.low_latency_dispatch(
+            hidden_states,
+            topk_idx,
+            num_max_dispatch_tokens_per_rank,
+            self.total_expert_num_contain_redundancy,
+            use_fp8=use_fp8_w8a8,
+            async_finish=False,
+            return_recv_hook=True,
+        )
+        return recv_x, masked_m, topk_idx, topk_weights, handle, hook
+
+    def select_experts_and_quant_input(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        e_score_correction_bias: torch.Tensor,
+        w13: WeightPack,
+        use_grouped_topk: bool,
+        num_experts_per_tok: int,
+        norm_topk_prob: bool,
+        topk_group: int,
+        n_group: int,
+        scoring_func: str,
+    ):
+        topk_weights, topk_idx = self._select_experts(
+            input_tensor=hidden_states,
+            router_logits=router_logits,
+            correction_bias=e_score_correction_bias,
+            use_grouped_topk=use_grouped_topk,
+            top_k=num_experts_per_tok,
+            renormalize=norm_topk_prob,
+            topk_group=topk_group,
+            num_expert_group=n_group,
+            scoring_func=scoring_func,
+        )
+        w13_weight, w13_scale = w13.weight, w13.weight_scale
+        block_size_k = 0
+        if w13_weight.ndim == 3:
+            block_size_k = w13_weight.shape[2] // w13_scale.shape[2]
+        assert block_size_k == 128, "block_size_k must be 128"
+        qinput_tensor, input_scale = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w13_weight.dtype)
+        return topk_weights, topk_idx.to(torch.long), (qinput_tensor, input_scale)
+
+    def dispatch(
+        self,
+        qinput_tensor: Tuple[torch.Tensor],
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        overlap_event: Optional[Any] = None,
+    ):
+        buffer = dist_group_manager.ep_buffer
+        # get_dispatch_layout
+        (
+            num_tokens_per_rank,
+            num_tokens_per_rdma_rank,
+            num_tokens_per_expert,
+            is_token_in_rank,
+            previous_event,
+        ) = buffer.get_dispatch_layout(
+            topk_idx,
+            self.total_expert_num_contain_redundancy,
+            previous_event=overlap_event,
+            async_finish=True,
+            allocate_on_comm_stream=True,
+        )
+        recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = buffer.dispatch(
+            qinput_tensor,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+            previous_event=previous_event,
+            async_finish=True,
+            allocate_on_comm_stream=True,
+            expert_alignment=128,
+        )
+
+        def hook():
+            event.current_stream_wait()
+
+        return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, hook
+
+    def masked_group_gemm(
+        self,
+        recv_x: Tuple[torch.Tensor],
+        w13: WeightPack,
+        w2: WeightPack,
+        masked_m: torch.Tensor,
+        dtype: torch.dtype,
+        expected_m: int,
+    ):
+        w13_weight, w13_scale = w13.weight, w13.weight_scale
+        w2_weight, w2_scale = w2.weight, w2.weight_scale
+        return masked_group_gemm(
+            recv_x, masked_m, dtype, w13_weight, w13_scale, w2_weight, w2_scale, expected_m=expected_m
+        )
+
+    def prefilled_group_gemm(
+        self,
+        num_recv_tokens_per_expert_list,
+        recv_x: Tuple[torch.Tensor],
+        recv_topk_idx: torch.Tensor,
+        recv_topk_weights: torch.Tensor,
+        w13: WeightPack,
+        w2: WeightPack,
+        hidden_dtype=torch.bfloat16,
+    ):
+        device = recv_x[0].device
+        w13_weight, w13_scale = w13.weight, w13.weight_scale
+        w2_weight, w2_scale = w2.weight, w2.weight_scale
+        _, K = recv_x[0].shape
+        _, N, _ = w13_weight.shape
+        block_size = self.quant_method.block_size
+        # scatter
+        all_tokens = sum(num_recv_tokens_per_expert_list)  # total number of tokens, padding included.
+        # gather_out shape: [receive_num_tokens, hidden]
+        gather_out = torch.empty_like(recv_x[0], device=device, dtype=hidden_dtype)
+        if all_tokens > 0:
+            input_tensor = [
+                torch.empty((all_tokens, K), device=device, dtype=recv_x[0].dtype),
+                torch.empty((all_tokens, K // 128), device=device, dtype=torch.float32),
+            ]
+            # Once filled, m_indices records which expert each token uses,
+            # for example [0, 0, 0, 0, ..., 1, 1, 1, 1, ..., cur_expert_num - 1, ...]:
+            # 0 appears num_recv_tokens_per_expert_list[0] times, 1 appears num_recv_tokens_per_expert_list[1] times,
+            # and so on.
+            m_indices = torch.empty(all_tokens, device=device, dtype=torch.int32)
+            # output_index shape: [receive_num_tokens, topk_num]
+            # output_index is used to record the token index in input_tensor
+            output_index = torch.empty_like(recv_topk_idx)
+
+            num_recv_tokens_per_expert = torch.tensor(
+                num_recv_tokens_per_expert_list, dtype=torch.int32, pin_memory=True, device="cpu"
+            ).cuda(non_blocking=True)
+
+            expert_start_loc = torch.empty_like(num_recv_tokens_per_expert)
+
+            ep_scatter(
+                recv_x[0],
+                recv_x[1],
+                recv_topk_idx,
+                num_recv_tokens_per_expert,
+                expert_start_loc,
+                input_tensor[0],
+                input_tensor[1],
+                m_indices,
+                output_index,
+            )
+            input_tensor[1] = tma_align_input_scale(input_tensor[1])
+            # groupgemm (contiguous layout)
+            gemm_out_a = torch.empty((all_tokens, N), device=device, dtype=hidden_dtype)
+
+            _deepgemm_grouped_fp8_nt_contiguous(input_tensor, (w13_weight, w13_scale), gemm_out_a, m_indices)
+
+            # silu_and_mul_fwd + quant
+            # TODO fused kernel
+            silu_out = torch.empty((all_tokens, N // 2), device=device, dtype=hidden_dtype)
+
+            silu_and_mul_fwd(gemm_out_a.view(-1, N), silu_out)
+            qsilu_out, qsilu_out_scale = per_token_group_quant_fp8(
+                silu_out, block_size, dtype=w13_weight.dtype, column_major_scales=True, scale_tma_aligned=True
+            )
+
+            # groupgemm (contiguous layout)
+            gemm_out_b = torch.empty((all_tokens, K), device=device, dtype=hidden_dtype)
+
+            _deepgemm_grouped_fp8_nt_contiguous(
+                (qsilu_out, qsilu_out_scale), (w2_weight, w2_scale), gemm_out_b, m_indices
+            )
+            # gather and local reduce
+            ep_gather(gemm_out_b, recv_topk_idx, recv_topk_weights, output_index, gather_out)
+        else:
+            ######################################## warning ##################################################
+            # Needed for the autotune feature: keeps the MoE model running the same triton kernel on every rank.
+            # In some cases a rank receives 0 tokens, so a one-token dummy input is run through the kernel.
+            if Autotuner.is_autotune_warmup():
+                _gemm_out_a = torch.zeros((1, N), device=device, dtype=hidden_dtype)
+                _silu_out = torch.zeros((1, N // 2), device=device, dtype=hidden_dtype)
+                silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
+                _gemm_out_a, _silu_out = None, None
+
+        return gather_out
+
+    def low_latency_combine(
+        self,
+        gemm_out_b: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        handle: Any,
+    ):
+        combined_x, event_overlap, hook = dist_group_manager.ep_buffer.low_latency_combine(
+            gemm_out_b, topk_idx, topk_weights, handle, async_finish=False, return_recv_hook=True
+        )
+        return combined_x, hook
+
+    def combine(
+        self,
+        gemm_out_b: torch.Tensor,
+        handle: Any,
+        overlap_event: Optional[Any] = None,
+    ):
+        # normal combine
+        combined_x, _, event = dist_group_manager.ep_buffer.combine(
+            gemm_out_b,
+            handle,
+            topk_weights=None,
+            async_finish=True,
+            previous_event=overlap_event,
+            allocate_on_comm_stream=True,
+        )
+
+        def hook():
+            event.current_stream_wait()
+
+        return combined_x, hook
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py
new file mode 100644
index 000000000..bdccbc0ee
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py
@@ -0,0 +1,56 @@
+import torch
+from .triton_impl import FuseMoeTriton
+from lightllm.common.quantization.quantize_method import (
+    WeightPack,
+)
+from typing import Optional
+
+
+class FuseMoeMarlin(FuseMoeTriton):
+    def create_workspace(self):
+        from lightllm.utils.vllm_utils import HAS_VLLM
+
+        assert HAS_VLLM, "moe awq marlin quantization requires kernels of vllm"
+        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+            marlin_make_workspace_new,
+        )
+
+        return marlin_make_workspace_new(torch.device("cuda"), 4)
+
+    def _fused_experts(
+        self,
+        input_tensor: torch.Tensor,
+        w13: WeightPack,
+        w2: WeightPack,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        router_logits: Optional[torch.Tensor] = None,
+        is_prefill: Optional[bool] = None,
+    ):
+
+        w1_weight, w1_scale, w1_zero_point = w13.weight, w13.weight_scale, w13.weight_zero_point
+        w2_weight, w2_scale, w2_zero_point = w2.weight, w2.weight_scale, w2.weight_zero_point
+
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+
+        fused_marlin_moe(
+            input_tensor,
+            w1_weight,
+            w2_weight,
+            None,
+            None,
+            w1_scale,
+            w2_scale,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            quant_type_id=self.quant_method.vllm_quant_type.id,
+            apply_router_weight_on_input=False,
+            global_num_experts=-1,
+            expert_map=None,
+            w1_zeros=w1_zero_point,
+            w2_zeros=w2_zero_point,
+            workspace=self.workspace,
+            inplace=True,
+        )
+        return input_tensor
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py
new file mode 100644
index 000000000..9965246a2
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py
@@ -0,0 +1,138 @@
+import torch
+from typing import Optional
+from lightllm.common.quantization.no_quant import WeightPack
+from lightllm.common.quantization.quantize_method import QuantizationMethod
+from .base_impl import FuseMoeBaseImpl
+
+
+class FuseMoeTriton(FuseMoeBaseImpl):
+    def __init__(
+        self,
+        n_routed_experts: int,
+        num_fused_shared_experts: int,
+        redundancy_expert_num: int,
+        routed_scaling_factor: float,
+        quant_method: QuantizationMethod,
+    ):
+        super().__init__(
+            n_routed_experts, num_fused_shared_experts, redundancy_expert_num, routed_scaling_factor, quant_method
+        )
+
+    def create_workspace(self):
+        return None
+
+    def _select_experts(
+        self,
+        input_tensor: torch.Tensor,
+        router_logits: torch.Tensor,
+        correction_bias: Optional[torch.Tensor],
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: int,
+        num_expert_group: int,
+        scoring_func: str,
+    ):
+        """Select experts and return topk weights and ids."""
+        from lightllm.common.basemodel.triton_kernel.fused_moe.topk_select import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=input_tensor,
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            scoring_func=scoring_func,
+        )
+        topk_weights.mul_(self.routed_scaling_factor)
+        if self.num_fused_shared_experts > 0:
+            pad_topk_ids = (
+                torch.arange(
+                    start=self.n_routed_experts,
+                    end=self.n_routed_experts + self.num_fused_shared_experts,
+                    step=1,
+                    dtype=topk_ids.dtype,
+                    device="cuda",
+                )
+                .view(1, self.num_fused_shared_experts)
+                .repeat(topk_ids.shape[0], 1)
+            )
+            pad_topk_weights = torch.full(
+                (topk_weights.shape[0], self.num_fused_shared_experts),
+                fill_value=1.0,
+                device="cuda",
+                dtype=topk_weights.dtype,
+            )
+
+            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
+            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
+        return topk_weights, topk_ids
+
+    def _fused_experts(
+        self,
+        input_tensor: torch.Tensor,
+        w13: WeightPack,
+        w2: WeightPack,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        router_logits: Optional[torch.Tensor] = None,
+        is_prefill: bool = False,
+    ):
+        w13_weight, w13_scale = w13.weight, w13.weight_scale
+        w2_weight, w2_scale = w2.weight, w2.weight_scale
+        use_fp8_w8a8 = w13_weight.dtype == torch.float8_e4m3fn
+
+        from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe import fused_experts
+
+        fused_experts(
+            hidden_states=input_tensor,
+            w1=w13_weight,
+            w2=w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w13_scale,
+            w2_scale=w2_scale,
+        )
+        return input_tensor
+
+    def __call__(
+        self,
+        input_tensor: torch.Tensor,
+        router_logits: torch.Tensor,
+        w13: WeightPack,
+        w2: WeightPack,
+        correction_bias: Optional[torch.Tensor],
+        scoring_func: str,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: int,
+        num_expert_group: int,
+        is_prefill: Optional[bool] = None,
+    ):
+        topk_weights, topk_ids = self._select_experts(
+            input_tensor=input_tensor,
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            scoring_func=scoring_func,
+        )
+        output = self._fused_experts(
+            input_tensor=input_tensor,
+            w13=w13,
+            w2=w2,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            router_logits=router_logits,
+            is_prefill=is_prefill,
+        )
+        return output
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 3ba4d3e59..1133e4d6a 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -65,7 +65,6 @@ def mm(
 
     def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod]):
         if quant_method is None:
-            self.quanted_weight_names = None
             self.weight_zero_point_names = None
             self.weight_scale_names = None
             return
@@ -86,9 +85,7 @@ def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod
                 quanted_weight_names.append(weight_name)
 
         if len(quanted_weight_names) != 0:
-            self.quanted_weight_names = quanted_weight_names
-        else:
-            self.quanted_weight_names = None
+            self.weight_names = quanted_weight_names
 
         if len(weight_scale_names) != 0:
             self.weight_scale_names = weight_scale_names
@@ -106,10 +103,6 @@ def load_hf_weights(self, weights):
         for sub_child_index, param_name in enumerate(self.weight_names):
             self._load_weight(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
 
-        if self.quanted_weight_names is not None:
-            for sub_child_index, param_name in enumerate(self.quanted_weight_names):
-                self._load_weight(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
-
         if self.bias_names is not None:
             for sub_child_index, param_name in enumerate(self.bias_names):
                 self._load_bias(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
@@ -124,10 +117,11 @@ def _create_weight(self):
         self.bias = None
         if self.bias_names is not None:
             self.bias = torch.empty(self.cusum_out_dims[-1], dtype=self.data_type_).cuda(get_current_device_id())
+            self.bias._load_ok = [False] * len(self.bias_names)
         self.mm_param: WeightPack = self.quant_method.create_weight(
             in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()
         )
-        self.load_cnt = 0
+        self.mm_param.initialize_load_status(len(self.weight_names))
         return
 
     # 执行顺序
@@ -139,9 +133,13 @@ def _load_weight(
             start_idx = self.cusum_out_dims[sub_child_index]
             if self.quant_method.weight_need_quanted(weight):
                 self.quant_method.quantize(weight, self.mm_param, offset=start_idx)
+                # weight_scale and zero_point are computed during online quantization,
+                # so their load flags are set to True here.
+                self.mm_param.load_ok[sub_child_index][1] = True
+                self.mm_param.load_ok[sub_child_index][2] = True
             else:
                 self.quant_method.load_weight(weight, self.mm_param, start_idx)
-            self.load_cnt += 1
+            self.mm_param.load_ok[sub_child_index][0] = True
         return
 
     def _load_bias(
@@ -151,7 +149,8 @@ def _load_bias(
             bias = self.param_slicer._slice_bias(weights[param_name])
             start_idx = self.cusum_out_dims[sub_child_index]
             end_idx = start_idx + bias.shape[0]
-            self.mm_param.bias[start_idx:end_idx].copy_(bias)
+            self.bias[start_idx:end_idx].copy_(bias)
+            self.bias._load_ok[sub_child_index] = True
         return
 
     def _load_weight_scale(
@@ -161,7 +160,7 @@ def _load_weight_scale(
             weight_scale = self.param_slicer._slice_weight_scale(weights[param_name])
             start_idx = self.cusum_out_dims[sub_child_index]
             self.quant_method.load_weight_scale(weight_scale, self.mm_param, start_idx)
-            self.load_cnt += 1
+            self.mm_param.load_ok[sub_child_index][1] = True
         return
 
     def _load_weight_zero_point(
@@ -171,14 +170,15 @@ def _load_weight_zero_point(
             weight_zero_point = self.param_slicer._slice_weight_zero_point(weights[param_name])
             start_idx = self.cusum_out_dims[sub_child_index]
             self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param, start_idx)
-            self.load_cnt += 1
+            self.mm_param.load_ok[sub_child_index][2] = True
         return
 
     def verify_load(self):
-        if self.quant_method.method_name != "none":
-            return self.load_cnt == len(self.weight_names) * 2
-        else:
-            return self.load_cnt == len(self.weight_names)
+        mm_param_load_ok = all(all(load_ok_list) for load_ok_list in self.mm_param.load_ok)
+        bias_load_ok = True if self.bias is None else all(self.bias._load_ok)
+        if not (mm_param_load_ok and bias_load_ok):
+            logger.warning(f"mm_param_load_ok: {self.mm_param.load_ok}, bias_load_ok: {self.bias}")
+        return mm_param_load_ok and bias_load_ok
 
     def _get_tp_dim(self, dim: int) -> int:
         assert (
@@ -218,7 +218,7 @@ def __init__(
 
     def _create_weight(self):
         self.weight = torch.empty(self.dim0, self.dim1, self.dim2, dtype=self.data_type_).cuda(get_current_device_id())
-        self.load_cnt = 0
+        self.weight._load_ok = False
         return
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
@@ -226,11 +226,11 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
             if weight_name in weights:
                 weight = self.param_slicer._slice_weight(weights[weight_name])
                 self.weight.copy_(weight)
-                self.load_cnt += 1
+                self.weight._load_ok = True
         return
 
     def verify_load(self):
-        return self.load_cnt == len(self.weight_names)
+        return self.weight._load_ok
 
     def bmm(
         self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 1a8f59723..d4717386b 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -19,15 +19,15 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
 
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
-        self.load_cnt = 0
+        self.weight.load_ok = False
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
-            self.load_cnt += 1
+            self.weight.load_ok = True
 
     def verify_load(self):
-        return self.load_cnt == 1
+        return self.weight.load_ok
 
     def _native_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
@@ -84,18 +84,19 @@ def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name
     def _create_weight(self):
         self.weight: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
         self.bias: torch.Tensor = torch.empty(self.dim, dtype=self.data_type_, device=self.device_id_)
-        self.load_cnt = 0
+        self.weight.load_ok = False
+        self.bias.load_ok = False
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
-            self.load_cnt += 1
+            self.weight.load_ok = True
         if self.bias_name in weights:
             self.bias.copy_(weights[self.bias_name])
-            self.load_cnt += 1
+            self.bias.load_ok = True
 
     def verify_load(self):
-        return self.load_cnt == 2
+        return self.weight.load_ok and self.bias.load_ok
 
     def _native_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
@@ -175,7 +176,7 @@ def load_hf_weights(self, weights):
             self.weight[:, end - start].copy_(t_weight[start:end].to(self.data_type_))
             # the padding part is zero
             self.weight[:, end:].zero_()
-            self.load_cnt += 1
+            self.weight.load_ok = True
 
 
 class NoTpGEMMANormWeight(RMSNormWeight):
diff --git a/lightllm/common/fused_moe/__init__.py b/lightllm/common/basemodel/triton_kernel/fused_moe/__init__.py
similarity index 100%
rename from lightllm/common/fused_moe/__init__.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/__init__.py
diff --git a/lightllm/common/fused_moe/deepep_scatter_gather.py b/lightllm/common/basemodel/triton_kernel/fused_moe/deepep_scatter_gather.py
similarity index 100%
rename from lightllm/common/fused_moe/deepep_scatter_gather.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/deepep_scatter_gather.py
diff --git a/lightllm/common/fused_moe/grouped_fused_moe.py b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
similarity index 100%
rename from lightllm/common/fused_moe/grouped_fused_moe.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
diff --git a/lightllm/common/fused_moe/grouped_fused_moe_ep.py b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe_ep.py
similarity index 96%
rename from lightllm/common/fused_moe/grouped_fused_moe_ep.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe_ep.py
index 2a577890b..2c6d013bd 100644
--- a/lightllm/common/fused_moe/grouped_fused_moe_ep.py
+++ b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe_ep.py
@@ -6,13 +6,15 @@
 from typing import Any, Callable, Dict, Optional, Tuple
 import torch.distributed as dist
 from lightllm.utils.log_utils import init_logger
-from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
-from lightllm.common.fused_moe.moe_silu_and_mul_mix_quant_ep import silu_and_mul_masked_post_quant_fwd
+from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
+from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul_mix_quant_ep import (
+    silu_and_mul_masked_post_quant_fwd,
+)
 from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import (
     per_token_group_quant_fp8,
     tma_align_input_scale,
 )
-from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
+from lightllm.common.basemodel.triton_kernel.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
 from lightllm.common.triton_utils.autotuner import Autotuner
 import numpy as np
diff --git a/lightllm/common/fused_moe/grouped_topk.py b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_topk.py
similarity index 100%
rename from lightllm/common/fused_moe/grouped_topk.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/grouped_topk.py
diff --git a/lightllm/common/fused_moe/moe_kernel_configs.py b/lightllm/common/basemodel/triton_kernel/fused_moe/moe_kernel_configs.py
similarity index 100%
rename from lightllm/common/fused_moe/moe_kernel_configs.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/moe_kernel_configs.py
diff --git a/lightllm/common/fused_moe/moe_silu_and_mul.py b/lightllm/common/basemodel/triton_kernel/fused_moe/moe_silu_and_mul.py
similarity index 100%
rename from lightllm/common/fused_moe/moe_silu_and_mul.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/moe_silu_and_mul.py
diff --git a/lightllm/common/fused_moe/moe_silu_and_mul_config.py b/lightllm/common/basemodel/triton_kernel/fused_moe/moe_silu_and_mul_config.py
similarity index 100%
rename from lightllm/common/fused_moe/moe_silu_and_mul_config.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/moe_silu_and_mul_config.py
diff --git a/lightllm/common/fused_moe/moe_silu_and_mul_mix_quant_ep.py b/lightllm/common/basemodel/triton_kernel/fused_moe/moe_silu_and_mul_mix_quant_ep.py
similarity index 100%
rename from lightllm/common/fused_moe/moe_silu_and_mul_mix_quant_ep.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/moe_silu_and_mul_mix_quant_ep.py
diff --git a/lightllm/common/fused_moe/moe_sum_recude_config.py b/lightllm/common/basemodel/triton_kernel/fused_moe/moe_sum_recude_config.py
similarity index 100%
rename from lightllm/common/fused_moe/moe_sum_recude_config.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/moe_sum_recude_config.py
diff --git a/lightllm/common/fused_moe/moe_sum_reduce.py b/lightllm/common/basemodel/triton_kernel/fused_moe/moe_sum_reduce.py
similarity index 100%
rename from lightllm/common/fused_moe/moe_sum_reduce.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/moe_sum_reduce.py
diff --git a/lightllm/common/fused_moe/softmax_topk.py b/lightllm/common/basemodel/triton_kernel/fused_moe/softmax_topk.py
similarity index 100%
rename from lightllm/common/fused_moe/softmax_topk.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/softmax_topk.py
diff --git a/lightllm/common/fused_moe/topk_select.py b/lightllm/common/basemodel/triton_kernel/fused_moe/topk_select.py
similarity index 96%
rename from lightllm/common/fused_moe/topk_select.py
rename to lightllm/common/basemodel/triton_kernel/fused_moe/topk_select.py
index 5206800ef..72c3a381e 100644
--- a/lightllm/common/fused_moe/topk_select.py
+++ b/lightllm/common/basemodel/triton_kernel/fused_moe/topk_select.py
@@ -22,7 +22,7 @@
 from lightllm.utils.sgl_utils import sgl_ops
 from lightllm.utils.light_utils import light_ops
 from typing import Callable, List, Optional, Tuple
-from lightllm.common.fused_moe.softmax_topk import softmax_topk
+from lightllm.common.basemodel.triton_kernel.fused_moe.softmax_topk import softmax_topk
 from lightllm.common.triton_utils.autotuner import Autotuner
 
 use_cuda_grouped_topk = os.getenv("LIGHTLLM_CUDA_GROUPED_TOPK", "False").upper() in ["ON", "TRUE", "1"]
@@ -177,8 +177,8 @@ def select_experts(
     scoring_func: str = "softmax",
     custom_routing_function: Optional[Callable] = None,
 ):
-    from lightllm.common.fused_moe.topk_select import fused_topk
-    from lightllm.common.fused_moe.grouped_topk import triton_grouped_topk
+    from lightllm.common.basemodel.triton_kernel.fused_moe.topk_select import fused_topk
+    from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_topk import triton_grouped_topk
 
     # DeekSeekv2 uses grouped_top_k
     if use_grouped_topk:
diff --git a/lightllm/common/quantization/quantize_method.py b/lightllm/common/quantization/quantize_method.py
index 77e59465e..4350307f1 100644
--- a/lightllm/common/quantization/quantize_method.py
+++ b/lightllm/common/quantization/quantize_method.py
@@ -18,6 +18,10 @@ def get_expert(self, expert_idx: int):
         weight_zero_point = self.weight_zero_point[expert_idx] if self.weight_zero_point is not None else None
         return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
 
+    def initialize_load_status(self, weight_num: int):
+        initial_loaded_status = [False, self.weight_scale is None, self.weight_zero_point is None]
+        self.load_ok = [initial_loaded_status.copy() for _ in range(weight_num)]
+
 
 class QuantizationMethod(ABC):
     def __init__(self):
diff --git a/lightllm/distributed/communication_op.py b/lightllm/distributed/communication_op.py
index d5c96f821..d606d757c 100644
--- a/lightllm/distributed/communication_op.py
+++ b/lightllm/distributed/communication_op.py
@@ -136,9 +136,9 @@ def get_group(self, group_index: int) -> CustomProcessGroup:
         return self.groups[group_index]
 
     def new_deepep_group(self, n_routed_experts, hidden_size):
-        moe_mode = os.getenv("MOE_MODE", "TP")
+        enable_ep_moe = get_env_start_args().enable_ep_moe
         num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank()
-        if moe_mode == "TP":
+        if not enable_ep_moe:
             self.ep_buffer = None
             return
         assert HAS_DEEPEP, "deep_ep is required for expert parallelism"
diff --git a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py
index 801ab6aba..e1e435cce 100644
--- a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py
@@ -63,8 +63,8 @@ def _bind_func(self):
 
     def _bind_ffn(self):
         if self.is_moe:
-            moe_mode = os.environ.get("MOE_MODE", "TP")
-            if moe_mode == "EP":
+            enable_ep_moe = get_env_start_args().enable_ep_moe
+            if enable_ep_moe:
                 self._ffn = partial(Deepseek2TransformerLayerInfer._moe_ffn_edp, self)
                 self._tpsp_ffn = self._tpsp_ffn_ep
             else:
diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
index 1e8d572e1..783e70e64 100644
--- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py
@@ -9,8 +9,7 @@
     ROWBMMWeight,
     COLMMWeight,
     RMSNormWeight,
-    FusedMoeWeightEP,
-    create_tp_moe_wegiht_obj,
+    FusedMoeWeight,
 )
 from ..triton_kernel.weight_dequant import weight_dequant
 
@@ -39,9 +38,8 @@ def _parse_config(self):
         self.kv_lora_rank = self.network_config_["kv_lora_rank"]
         self.num_fused_shared_experts = 0
         if get_env_start_args().enable_fused_shared_experts and self.is_moe:
-            # MOE_MODE 处于 TP 模式下才能使能 enable_fused_shared_experts
-            moe_mode = os.getenv("MOE_MODE", "TP")
-            assert moe_mode == "TP"
+            # enable_fused_shared_experts can only work with tensor parallelism
+            assert not get_env_start_args().enable_ep_moe, "enable_fused_shared_experts can only work with tp mode."
             self.num_fused_shared_experts = self.network_config_.get("n_shared_experts", 0)
         self.n_embed = self.network_config_["hidden_size"]
         self.n_inter = self.network_config_["intermediate_size"]
@@ -97,7 +95,6 @@ def load_hf_weights(self, weights):
         weight_scale_suffix = None
         if self.quant_cfg.quantized_weight:
             weight_scale_suffix = kv_b_quant_method.weight_scale_suffix
-
         if f"model.layers.{self.layer_num_}.self_attn.kv_b_proj.weight" in weights:
             kv_b_proj_ = weights[f"model.layers.{self.layer_num_}.self_attn.kv_b_proj.weight"]
             # for deepseek_v3, the bmm operator is not quantized
@@ -187,9 +184,9 @@ def _init_qkvo(self):
         )
 
     def _load_mlp(self, mlp_prefix, is_shared_experts=False):
-        moe_mode = os.getenv("MOE_MODE", "TP")
+        enable_ep_moe = get_env_start_args().enable_ep_moe
         mlp_inter = self.moe_inter if is_shared_experts else self.n_inter
-        if self.is_moe and moe_mode == "EP":
+        if self.is_moe and enable_ep_moe:
             self.gate_up_proj = ROWMMWeight(
                 in_dim=self.n_embed,
                 out_dims=[mlp_inter, mlp_inter],
@@ -243,38 +240,21 @@ def _init_moe(self):
         # == 0 时，说明不存在融合共享专家，共享专家单独加载和进行推理。
         if self.num_fused_shared_experts == 0:
             self._load_mlp(f"model.layers.{self.layer_num_}.mlp.shared_experts", is_shared_experts=True)
-        moe_mode = os.getenv("MOE_MODE", "TP")
-        assert moe_mode in ["EP", "TP"]
-        if moe_mode == "TP":
-            self.experts = create_tp_moe_wegiht_obj(
-                gate_proj_name="gate_proj",
-                down_proj_name="down_proj",
-                up_proj_name="up_proj",
-                e_score_correction_bias_name=self.e_score_correction_bias_name,
-                weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts",
-                n_routed_experts=self.n_routed_experts,
-                num_fused_shared_experts=self.num_fused_shared_experts,
-                split_inter_size=moe_intermediate_size // self.tp_world_size_,
-                data_type=self.data_type_,
-                network_config=self.network_config_,
-                layer_num=self.layer_num_,
-                quant_cfg=self.quant_cfg,
-            )
-        elif moe_mode == "EP":
-            self.experts = FusedMoeWeightEP(
-                gate_proj_name="gate_proj",
-                down_proj_name="down_proj",
-                up_proj_name="up_proj",
-                e_score_correction_bias_name=self.e_score_correction_bias_name,
-                weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts",
-                n_routed_experts=self.n_routed_experts,
-                data_type=self.data_type_,
-                network_config=self.network_config_,
-                layer_num=self.layer_num_,
-                quant_cfg=self.quant_cfg,
-            )
-        else:
-            raise ValueError(f"Unsupported moe mode: {moe_mode}")
+        self.experts = FusedMoeWeight(
+            gate_proj_name="gate_proj",
+            down_proj_name="down_proj",
+            up_proj_name="up_proj",
+            e_score_correction_bias_name=self.e_score_correction_bias_name,
+            weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts",
+            n_routed_experts=self.n_routed_experts,
+            hidden_size=self.n_embed,
+            moe_intermediate_size=moe_intermediate_size,
+            data_type=self.data_type_,
+            quant_method=self.quant_cfg.get_quant_method(self.layer_num_, "fused_moe"),
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            layer_num=self.layer_num_,
+            network_config=self.network_config_,
+        )
 
     def _init_ffn(self):
         self._load_mlp(f"model.layers.{self.layer_num_}.mlp")
diff --git a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
index 0e7f4c873..e6d58c3b2 100644
--- a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
@@ -9,6 +9,7 @@
 from lightllm.common.basemodel.layer_weights.meta_weights import TpAttSinkWeight
 from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
 from lightllm.utils.log_utils import init_logger
+from lightllm.utils.envs_utils import get_env_start_args
 
 logger = init_logger(__name__)
 
@@ -25,10 +26,10 @@ def __init__(
         return
 
     def _init_moe(self):
-        moe_mode = os.getenv("MOE_MODE", "TP")
+        enable_ep_moe = get_env_start_args().enable_ep_moe
         moe_intermediate_size = self.network_config_["intermediate_size"]
         n_routed_experts = self.network_config_["num_local_experts"]
-        assert moe_mode in ["TP"], "For now, GPT-OSS type model only support MOE TP mode."
+        assert not enable_ep_moe, "For now, GPT-OSS type model only support MOE TP mode."
 
         self.moe_gate = ROWMMWeight(
             in_dim=self.n_embed,
diff --git a/lightllm/models/llama/layer_infer/transformer_layer_infer.py b/lightllm/models/llama/layer_infer/transformer_layer_infer.py
index 820c5efa0..dc6f10be5 100644
--- a/lightllm/models/llama/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/llama/layer_infer/transformer_layer_infer.py
@@ -4,7 +4,7 @@
 from functools import partial
 from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
 from lightllm.models.llama.triton_kernel.rotary_emb import rotary_emb_fwd
-from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
+from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
 from lightllm.common.basemodel import TransformerLayerInferTpl
 from lightllm.distributed.communication_op import all_gather_into_tensor, reduce_scatter_tensor
diff --git a/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py b/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
index fa20a63f9..51c62fd4c 100644
--- a/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/mixtral/layer_weights/transformer_layer_weight.py
@@ -2,7 +2,8 @@
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import enable_env_vars
 from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeightEP, create_tp_moe_wegiht_obj
+from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeight
+from lightllm.utils.envs_utils import get_env_start_args
 
 logger = init_logger(__name__)
 
@@ -31,7 +32,6 @@ def _init_ffn(self):
 
     def _init_moe(self):
         inter_size = self.network_config_["intermediate_size"]
-        split_inter_size = inter_size // self.tp_world_size_
 
         self.moe_gate = ROWMMWeight(
             in_dim=self.n_embed,
@@ -43,25 +43,18 @@ def _init_moe(self):
             tp_rank=0,
             tp_world_size=1,  # no tensor parallelism
         )
-
-        moe_mode = os.getenv("MOE_MODE", "TP")
-        assert moe_mode in ["TP"], f"Unsupported moe mode: {moe_mode}"
-
-        if moe_mode == "TP":
-            self.experts = create_tp_moe_wegiht_obj(
-                gate_proj_name="w1",
-                down_proj_name="w2",
-                up_proj_name="w3",
-                e_score_correction_bias_name="",
-                weight_prefix=f"model.layers.{self.layer_num_}.block_sparse_moe.experts",
-                n_routed_experts=self.n_routed_experts,
-                split_inter_size=split_inter_size,
-                data_type=self.data_type_,
-                network_config=self.network_config_,
-                layer_num=self.layer_num_,
-                quant_cfg=self.quant_cfg,
-                num_fused_shared_experts=0,
-                hidden_size=self.network_config_.get("hidden_size"),
-            )
-        else:
-            raise ValueError(f"Unsupported moe mode: {moe_mode}")
+        assert not get_env_start_args().enable_ep_moe, "Mixtral only support tp mode."
+        self.experts = FusedMoeWeight(
+            gate_proj_name="w1",
+            down_proj_name="w2",
+            up_proj_name="w3",
+            e_score_correction_bias_name="",
+            weight_prefix=f"model.layers.{self.layer_num_}.block_sparse_moe.experts",
+            n_routed_experts=self.n_routed_experts,
+            hidden_size=self.n_embed,
+            moe_intermediate_size=inter_size,
+            data_type=self.data_type_,
+            quant_method=self.quant_cfg.get_quant_method(self.layer_num_, "fused_moe"),
+            layer_num=self.layer_num_,
+            network_config=self.network_config_,
+        )
diff --git a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
index 52f9289eb..71b16cb34 100644
--- a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
@@ -14,6 +14,7 @@
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_global_world_size
 from lightllm.distributed.communication_op import all_gather_into_tensor, reduce_scatter_tensor
+from lightllm.utils.envs_utils import get_env_start_args
 
 logger = init_logger(__name__)
 
@@ -41,8 +42,8 @@ def _bind_func(self):
 
     def _bind_ffn(self):
         if self.is_moe:
-            moe_mode = os.environ.get("MOE_MODE", "TP")
-            if moe_mode == "EP":
+            enable_ep_moe = get_env_start_args().enable_ep_moe
+            if enable_ep_moe:
                 self._ffn = partial(Qwen3MOETransformerLayerInfer._moe_ffn_edp, self)
                 self._tpsp_ffn = self._tpsp_ffn_ep
             else:
diff --git a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
index 54cf7f02d..a889609d7 100644
--- a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py
@@ -1,6 +1,6 @@
 import os
 from lightllm.models.qwen3.layer_weights.transformer_layer_weight import Qwen3TransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeightEP, create_tp_moe_wegiht_obj
+from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeight
 
 
 class Qwen3MOETransformerLayerWeight(Qwen3TransformerLayerWeight):
@@ -52,35 +52,17 @@ def _init_moe(self):
             tp_rank=0,
             tp_world_size=1,
         )
-        moe_mode = os.getenv("MOE_MODE", "TP")
-        assert moe_mode in ["EP", "TP"]
-        if moe_mode == "TP":
-            self.experts = create_tp_moe_wegiht_obj(
-                gate_proj_name="gate_proj",
-                down_proj_name="down_proj",
-                up_proj_name="up_proj",
-                e_score_correction_bias_name="",
-                weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts",
-                n_routed_experts=self.n_routed_experts,
-                split_inter_size=moe_intermediate_size // self.tp_world_size_,
-                data_type=self.data_type_,
-                network_config=self.network_config_,
-                layer_num=self.layer_num_,
-                quant_cfg=self.quant_cfg,
-                num_fused_shared_experts=0,
-            )
-        elif moe_mode == "EP":
-            self.experts = FusedMoeWeightEP(
-                gate_proj_name="gate_proj",
-                down_proj_name="down_proj",
-                up_proj_name="up_proj",
-                e_score_correction_bias_name="",
-                weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts",
-                n_routed_experts=self.n_routed_experts,
-                data_type=self.data_type_,
-                network_config=self.network_config_,
-                layer_num=self.layer_num_,
-                quant_cfg=self.quant_cfg,
-            )
-        else:
-            raise ValueError(f"Unsupported moe mode: {moe_mode}")
+        self.experts = FusedMoeWeight(
+            gate_proj_name="gate_proj",
+            down_proj_name="down_proj",
+            up_proj_name="up_proj",
+            e_score_correction_bias_name="",
+            weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts",
+            n_routed_experts=self.n_routed_experts,
+            hidden_size=self.network_config_["hidden_size"],
+            moe_intermediate_size=moe_intermediate_size,
+            data_type=self.data_type_,
+            quant_method=self.quant_cfg.get_quant_method(self.layer_num_, "fused_moe"),
+            layer_num=self.layer_num_,
+            network_config=self.network_config_,
+        )
diff --git a/lightllm/models/qwen3_vl_moe/layer_weights/transformers_layer_weight.py b/lightllm/models/qwen3_vl_moe/layer_weights/transformers_layer_weight.py
index 48ddf5208..83c05ba26 100644
--- a/lightllm/models/qwen3_vl_moe/layer_weights/transformers_layer_weight.py
+++ b/lightllm/models/qwen3_vl_moe/layer_weights/transformers_layer_weight.py
@@ -1,6 +1,5 @@
 import os
 from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeightEP, create_tp_moe_wegiht_obj
 
 
 class Qwen3VLMOETransformerLayerWeight(Qwen3MOETransformerLayerWeight):
diff --git a/lightllm/models/vit/layer_weights/transformer_layer_weight.py b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
index 8bcbe3358..54ad36786 100644
--- a/lightllm/models/vit/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
@@ -1,5 +1,4 @@
 import os
-from turtle import TPen
 import torch
 import math
 import numpy as np
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index e4b5f8f2b..e49b0cc67 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -516,6 +516,11 @@ def make_argument_parser() -> argparse.ArgumentParser:
             " Therefore, it is recommended to set this parameter according to actual needs."
         ),
     )
+    parser.add_argument(
+        "--enable_ep_moe",
+        action="store_true",
+        help="""Whether to enable ep moe for deepseekv3 model.""",
+    )
     parser.add_argument(
         "--ep_redundancy_expert_config_path",
         type=str,
@@ -530,7 +535,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--enable_fused_shared_experts",
         action="store_true",
-        help="""Whether to enable fused shared experts for deepseekv3 model. only work when MOE_MODE=TP """,
+        help="""Whether to enable fused shared experts for deepseekv3 model. only work when tensor parallelism""",
     )
     parser.add_argument(
         "--mtp_mode",
diff --git a/test/start_scripts/README.md b/test/start_scripts/README.md
index e00af2713..8ed44a275 100644
--- a/test/start_scripts/README.md
+++ b/test/start_scripts/README.md
@@ -99,7 +99,6 @@ sh multi_pd_master/pd_decode.sh <host> <config_server_host>
 ### Environment Variables
 
 - `LOADWORKER`: Model loading thread count, recommended 8-18
-- `MOE_MODE`: Expert parallelism mode, set to EP to enable expert parallelism
 - `DISABLE_KV_TRANS_USE_P2P`: Disable P2P communication optimization to transfer kv data
 - `CUDA_VISIBLE_DEVICES`: Specify GPU devices to use
 
@@ -108,6 +107,7 @@ sh multi_pd_master/pd_decode.sh <host> <config_server_host>
 - `--model_dir`: Model file path
 - `--tp`: Tensor parallelism degree
 - `--dp`: Data parallelism degree
+- `--enable_ep_moe`: Enable expert parallelism for MoE layers
 - `--nnodes`: Total number of nodes
 - `--node_rank`: Current node rank
 - `--nccl_host`: NCCL communication host address
diff --git a/test/start_scripts/multi_node_ep_node0.sh b/test/start_scripts/multi_node_ep_node0.sh
index cd72e6cfc..2cc6b03c9 100644
--- a/test/start_scripts/multi_node_ep_node0.sh
+++ b/test/start_scripts/multi_node_ep_node0.sh
@@ -2,14 +2,14 @@
 # nccl_host: the ip of the nccl host
 # sh multi_node_ep_node0.sh <nccl_host>
 export nccl_host=$1
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
 --model_dir /path/DeepSeek-R1 \
 --tp 16 \
 --dp 16 \
 --nnodes 2 \
 --node_rank 0 \
 --nccl_host $nccl_host \
---nccl_port 2732 
+--nccl_port 2732 --enable_ep_moe
 # if you want to enable microbatch overlap, you can uncomment the following lines
 #--enable_prefill_microbatch_overlap
 #--enable_decode_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/multi_node_ep_node1.sh b/test/start_scripts/multi_node_ep_node1.sh
index 17b878a1b..cc920b0b0 100644
--- a/test/start_scripts/multi_node_ep_node1.sh
+++ b/test/start_scripts/multi_node_ep_node1.sh
@@ -2,14 +2,14 @@
 # nccl_host: the ip of the nccl host
 # sh multi_node_ep_node1.sh <nccl_host>
 export nccl_host=$1
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
 --model_dir /path/DeepSeek-R1 \
 --tp 16 \
 --dp 16 \
 --nnodes 2 \
 --node_rank 1 \
 --nccl_host $nccl_host \
---nccl_port 2732 
+--nccl_port 2732 --enable_ep_moe
 # if you want to enable microbatch overlap, you can uncomment the following lines
 #--enable_prefill_microbatch_overlap
 #--enable_decode_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/multi_pd_master/pd_prefill.sh b/test/start_scripts/multi_pd_master/pd_prefill.sh
index 41ad52551..45f6c0c01 100644
--- a/test/start_scripts/multi_pd_master/pd_prefill.sh
+++ b/test/start_scripts/multi_pd_master/pd_prefill.sh
@@ -5,7 +5,7 @@
 export host=$1
 export config_server_host=$2
 nvidia-cuda-mps-control -d
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+LOADWORKER=18 python -m lightllm.server.api_server \
 --model_dir /path/DeepSeek-R1 \
 --run_mode "prefill" \
 --host $host \
@@ -15,6 +15,7 @@ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
 --nccl_port 2732 \
 --disable_cudagraph \
 --config_server_host $config_server_host \
---config_server_port 60088
+--config_server_port 60088 \
+--enable_ep_moe
 # if you want to enable microbatch overlap, you can uncomment the following lines
 #--enable_prefill_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/single_node_ep.sh b/test/start_scripts/single_node_ep.sh
index e143c34ec..21d2ebaa3 100644
--- a/test/start_scripts/single_node_ep.sh
+++ b/test/start_scripts/single_node_ep.sh
@@ -1,8 +1,9 @@
 # H200 single node deepseek R1 dpep mode
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
 --model_dir /path/DeepSeek-R1 \
 --tp 8 \
---dp 8
+--dp 8 \
+--enable_ep_moe
 # if you want to enable microbatch overlap, you can uncomment the following lines
 #--enable_prefill_microbatch_overlap \
 #--enable_decode_microbatch_overlap \
diff --git a/test/start_scripts/single_pd_master/pd_decode.sh b/test/start_scripts/single_pd_master/pd_decode.sh
index 9601d5117..dac7a6dac 100644
--- a/test/start_scripts/single_pd_master/pd_decode.sh
+++ b/test/start_scripts/single_pd_master/pd_decode.sh
@@ -5,7 +5,7 @@
 export host=$1
 export pd_master_ip=$2
 nvidia-cuda-mps-control -d
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+LOADWORKER=18 python -m lightllm.server.api_server \
 --model_dir /path/DeepSeek-R1 \
 --run_mode "decode" \
 --tp 8 \
@@ -13,6 +13,7 @@ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
 --host $host \
 --port 8121 \
 --nccl_port 12322 \
+--enable_ep_moe \
 --pd_master_ip $pd_master_ip \
 --pd_master_port 60011 
 # if you want to enable microbatch overlap, you can uncomment the following lines
diff --git a/test/start_scripts/single_pd_master/pd_nixl_decode.sh b/test/start_scripts/single_pd_master/pd_nixl_decode.sh
index 931fee862..4b3fd0bc4 100644
--- a/test/start_scripts/single_pd_master/pd_nixl_decode.sh
+++ b/test/start_scripts/single_pd_master/pd_nixl_decode.sh
@@ -10,7 +10,7 @@ export UCX_LOG_LEVEL=info
 export UCX_TLS=rc,cuda,gdr_copy
 
 nvidia-cuda-mps-control -d
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+LOADWORKER=18 python -m lightllm.server.api_server \
 --model_dir /path/DeepSeek-R1 \
 --run_mode "nixl_decode" \
 --tp 8 \
@@ -18,6 +18,7 @@ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
 --host $host \
 --port 8121 \
 --nccl_port 12322 \
+--enable_ep_moe \
 --pd_master_ip $pd_master_ip \
 --pd_master_port 60011 
 # if you want to enable microbatch overlap, you can uncomment the following lines
diff --git a/test/start_scripts/single_pd_master/pd_nixl_prefill.sh b/test/start_scripts/single_pd_master/pd_nixl_prefill.sh
index 6363207cb..f415919f9 100644
--- a/test/start_scripts/single_pd_master/pd_nixl_prefill.sh
+++ b/test/start_scripts/single_pd_master/pd_nixl_prefill.sh
@@ -11,7 +11,7 @@ export UCX_TLS=rc,cuda,gdr_copy
 export host=$1
 export pd_master_ip=$2
 nvidia-cuda-mps-control -d 
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+LOADWORKER=18 python -m lightllm.server.api_server \
 --model_dir /path/DeepSeek-R1 \
 --run_mode "nixl_prefill" \
 --tp 8 \
@@ -19,6 +19,7 @@ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
 --host $host \
 --port 8019 \
 --nccl_port 2732 \
+--enable_ep_moe \
 --disable_cudagraph \
 --pd_master_ip $pd_master_ip \
 --pd_master_port 60011 
diff --git a/test/start_scripts/single_pd_master/pd_prefill.sh b/test/start_scripts/single_pd_master/pd_prefill.sh
index 0c1bd2659..6bde9ef32 100644
--- a/test/start_scripts/single_pd_master/pd_prefill.sh
+++ b/test/start_scripts/single_pd_master/pd_prefill.sh
@@ -5,7 +5,7 @@
 export host=$1
 export pd_master_ip=$2
 nvidia-cuda-mps-control -d 
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+LOADWORKER=18 python -m lightllm.server.api_server \
 --model_dir /path/DeepSeek-R1 \
 --run_mode "prefill" \
 --tp 8 \
@@ -15,6 +15,7 @@ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
 --nccl_port 2732 \
 --disable_cudagraph \
 --pd_master_ip $pd_master_ip \
---pd_master_port 60011 
+--pd_master_port 60011 \
+--enable_ep_moe
 # if you want to enable microbatch overlap, you can uncomment the following lines
 #--enable_prefill_microbatch_overlap
\ No newline at end of file

From b620c95fb136c236fba35b267ea75b5fd1964a4f Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 22 Jan 2026 17:33:55 +0000
Subject: [PATCH 26/65] redundancy_expert(draft)

---
 .../source/tutorial/deepseek_deployment.rst   |  35 +++---
 .../source/tutorial/deepseek_deployment.rst   |  35 +++---
 ...ight_ep_redundancy.py => ep_redundancy.py} |   0
 .../fused_moe/fused_moe_weight.py             | 110 +++++++++++++-----
 .../fused_moe/gpt_oss_fused_moe_weight_tp.py  |  74 ++----------
 .../meta_weights/fused_moe/impl/base_impl.py  |  19 ++-
 .../fused_moe/impl/triton_impl.py             |  14 ++-
 .../mode_backend/redundancy_expert_manager.py |   6 +-
 8 files changed, 156 insertions(+), 137 deletions(-)
 rename lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/{fused_moe_weight_ep_redundancy.py => ep_redundancy.py} (100%)

diff --git a/docs/CN/source/tutorial/deepseek_deployment.rst b/docs/CN/source/tutorial/deepseek_deployment.rst
index 2fc5d3e62..de7ecc84c 100644
--- a/docs/CN/source/tutorial/deepseek_deployment.rst
+++ b/docs/CN/source/tutorial/deepseek_deployment.rst
@@ -49,13 +49,14 @@ LightLLM 支持以下几种部署模式：
 .. code-block:: bash
 
     # H200 单机 DeepSeek-R1 DP + EP 模式
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+    LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
     --model_dir /path/DeepSeek-R1 \
     --tp 8 \
-    --dp 8
+    --dp 8 \
+    --enable_ep_moe
 
 **参数说明:**
-- `MOE_MODE=EP`: 设置专家并行模式
+- `--enable_ep_moe`: 启用专家并行 (EP) 模式
 - `--tp 8`: 张量并行度
 - `--dp 8`: 数据并行度，通常设置为与 tp 相同的值
 
@@ -119,14 +120,14 @@ LightLLM 支持以下几种部署模式：
     # H200 多机 DeepSeek-R1 EP 模式 Node 0
     # 使用方法: sh multi_node_ep_node0.sh <nccl_host>
     export nccl_host=$1
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+    LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
     --model_dir /path/DeepSeek-R1 \
     --tp 16 \
     --dp 16 \
     --nnodes 2 \
     --node_rank 0 \
     --nccl_host $nccl_host \
-    --nccl_port 2732
+    --nccl_port 2732 --enable_ep_moe
 
 **Node 1 启动命令:**
 
@@ -135,14 +136,14 @@ LightLLM 支持以下几种部署模式：
     # H200 多机 DeepSeek-R1 EP 模式 Node 1
     # 使用方法: sh multi_node_ep_node1.sh <nccl_host>
     export nccl_host=$1
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+    LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
     --model_dir /path/DeepSeek-R1 \
     --tp 16 \
     --dp 16 \
     --nnodes 2 \
     --node_rank 1 \
     --nccl_host $nccl_host \
-    --nccl_port 2732
+    --nccl_port 2732 --enable_ep_moe
 
 **可选优化参数:**
 - `--enable_prefill_microbatch_overlap`: 启用预填充微批次重叠
@@ -179,7 +180,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     export host=$1
     export pd_master_ip=$2
     nvidia-cuda-mps-control -d 
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "prefill" \
     --tp 8 \
@@ -189,7 +190,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     --nccl_port 2732 \
     --disable_cudagraph \
     --pd_master_ip $pd_master_ip \
-    --pd_master_port 60011
+    --pd_master_port 60011 \
+    --enable_ep_moe
     # 如果需要启用微批次重叠，可以取消注释以下行
     #--enable_prefill_microbatch_overlap
 
@@ -202,7 +204,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     export host=$1
     export pd_master_ip=$2
     nvidia-cuda-mps-control -d
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "decode" \
     --tp 8 \
@@ -212,7 +214,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     --nccl_port 12322 \
     --disable_cudagraph \
     --pd_master_ip $pd_master_ip \
-    --pd_master_port 60011
+    --pd_master_port 60011 \
+    --enable_ep_moe
     # 如果需要启用微批次重叠，可以取消注释以下行
     #--enable_decode_microbatch_overlap
 
@@ -269,7 +272,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     export host=$1
     export config_server_host=$2
     nvidia-cuda-mps-control -d
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "prefill" \
     --host $host \
@@ -279,7 +282,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     --nccl_port 2732 \
     --disable_cudagraph \
     --config_server_host $config_server_host \
-    --config_server_port 60088
+    --config_server_port 60088 \
+    --enable_ep_moe
     # 如果需要启用微批次重叠，可以取消注释以下行
     #--enable_prefill_microbatch_overlap
 
@@ -287,7 +291,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     export host=$1
     export config_server_host=$2
     nvidia-cuda-mps-control -d
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "decode" \
     --host $host \
@@ -296,7 +300,8 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署，可以
     --tp 8 \
     --dp 8 \
     --config_server_host $config_server_host \
-    --config_server_port 60088
+    --config_server_port 60088 \
+    --enable_ep_moe
     # 如果需要启用微批次重叠，可以取消注释以下行
     #--enable_decode_microbatch_overlap
 
diff --git a/docs/EN/source/tutorial/deepseek_deployment.rst b/docs/EN/source/tutorial/deepseek_deployment.rst
index 280a61ceb..4c5a121dd 100755
--- a/docs/EN/source/tutorial/deepseek_deployment.rst
+++ b/docs/EN/source/tutorial/deepseek_deployment.rst
@@ -49,13 +49,14 @@ Suitable for expert parallelism deployment of MoE models like DeepSeek-V2/V3.
 .. code-block:: bash
 
     # H200 Single node DeepSeek-R1 DP + EP Mode
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+    LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
     --model_dir /path/DeepSeek-R1 \
     --tp 8 \
-    --dp 8
+    --dp 8 \
+    --enable_ep_moe
 
 **Parameter Description:**
-- `MOE_MODE=EP`: Set expert parallelism mode
+- `--enable_ep_moe`: Enable expert parallelism (EP) mode
 - `--tp 8`: Tensor parallelism
 - `--dp 8`: Data parallelism, usually set to the same value as tp
 
@@ -119,14 +120,14 @@ Suitable for deploying MoE models across multiple nodes.
     # H200 Multi-node DeepSeek-R1 EP Mode Node 0
     # Usage: sh multi_node_ep_node0.sh <nccl_host>
     export nccl_host=$1
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+    LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
     --model_dir /path/DeepSeek-R1 \
     --tp 16 \
     --dp 16 \
     --nnodes 2 \
     --node_rank 0 \
     --nccl_host $nccl_host \
-    --nccl_port 2732
+    --nccl_port 2732 --enable_ep_moe
 
 **Node 1 Launch Command:**
 
@@ -135,14 +136,14 @@ Suitable for deploying MoE models across multiple nodes.
     # H200 Multi-node DeepSeek-R1 EP Mode Node 1
     # Usage: sh multi_node_ep_node1.sh <nccl_host>
     export nccl_host=$1
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+    LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
     --model_dir /path/DeepSeek-R1 \
     --tp 16 \
     --dp 16 \
     --nnodes 2 \
     --node_rank 1 \
     --nccl_host $nccl_host \
-    --nccl_port 2732
+    --nccl_port 2732 --enable_ep_moe
 
 **Optional Optimization Parameters:**
 - `--enable_prefill_microbatch_overlap`: Enable prefill microbatch overlap
@@ -179,7 +180,7 @@ PD (Prefill-Decode) disaggregation mode separates prefill and decode stages for
     export host=$1
     export pd_master_ip=$2
     nvidia-cuda-mps-control -d 
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "prefill" \
     --tp 8 \
@@ -188,7 +189,8 @@ PD (Prefill-Decode) disaggregation mode separates prefill and decode stages for
     --port 8019 \
     --nccl_port 2732 \
     --disable_cudagraph \
-    --pd_master_ip $pd_master_ip 
+    --pd_master_ip $pd_master_ip \
+    --enable_ep_moe
 
 **Step 3: Launch Decode Service**
 
@@ -199,7 +201,7 @@ PD (Prefill-Decode) disaggregation mode separates prefill and decode stages for
     export host=$1
     export pd_master_ip=$2
     nvidia-cuda-mps-control -d
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "decode" \
     --tp 8 \
@@ -209,7 +211,8 @@ PD (Prefill-Decode) disaggregation mode separates prefill and decode stages for
     --nccl_port 12322 \
     --disable_cudagraph \
     --pd_master_ip $pd_master_ip \
-    --pd_master_port 60011
+    --pd_master_port 60011 \
+    --enable_ep_moe
     # if you want to enable microbatch overlap, you can uncomment the following lines
     #--enable_decode_microbatch_overlap
 
@@ -266,7 +269,7 @@ Supports multiple PD Master nodes, providing better load balancing and high avai
     export host=$1
     export config_server_host=$2
     nvidia-cuda-mps-control -d
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "prefill" \
     --host $host \
@@ -276,7 +279,8 @@ Supports multiple PD Master nodes, providing better load balancing and high avai
     --nccl_port 2732 \
     --disable_cudagraph \
     --config_server_host $config_server_host \
-    --config_server_port 60088
+    --config_server_port 60088 \
+    --enable_ep_moe
     # if you want to enable microbatch overlap, you can uncomment the following lines
     #--enable_prefill_microbatch_overlap
 
@@ -284,7 +288,7 @@ Supports multiple PD Master nodes, providing better load balancing and high avai
     export host=$1
     export config_server_host=$2
     nvidia-cuda-mps-control -d
-    MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+    LOADWORKER=18 python -m lightllm.server.api_server \
     --model_dir /path/DeepSeek-R1 \
     --run_mode "decode" \
     --host $host \
@@ -293,7 +297,8 @@ Supports multiple PD Master nodes, providing better load balancing and high avai
     --tp 8 \
     --dp 8 \
     --config_server_host $config_server_host \
-    --config_server_port 60088
+    --config_server_port 60088 \
+    --enable_ep_moe
     # if you want to enable microbatch overlap, you can uncomment the following lines
     #--enable_decode_microbatch_overlap
 
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/ep_redundancy.py
similarity index 100%
rename from lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep_redundancy.py
rename to lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/ep_redundancy.py
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index 8b01f4643..ced1e9267 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -11,6 +11,9 @@
 from lightllm.common.quantization.quantize_method import QuantizationMethod
 from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num, get_env_start_args
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
 
 
 class FusedMoeWeight(BaseWeightTpl):
@@ -53,13 +56,17 @@ def __init__(
         self.n_routed_experts = n_routed_experts
         self.num_fused_shared_experts = num_fused_shared_experts
         self._init_config(network_config)
+        self._init_redundancy_expert_params()
         self._init_parallel_params()
         self.fuse_moe_impl = select_fuse_moe_impl(self.quant_method, self.enable_ep_moe)(
             n_routed_experts=self.n_routed_experts,
             num_fused_shared_experts=self.num_fused_shared_experts,
-            redundancy_expert_num=self.redundancy_expert_num,
             routed_scaling_factor=self.routed_scaling_factor,
             quant_method=self.quant_method,
+            redundancy_expert_num=self.redundancy_expert_num,
+            redundancy_expert_ids_tensor=self.redundancy_expert_ids_tensor,
+            routed_expert_counter_tensor=self.routed_expert_counter_tensor,
+            auto_update_redundancy_expert=self.auto_update_redundancy_expert,
         )
         self.lock = threading.Lock()
         self._create_weight()
@@ -73,18 +80,40 @@ def _init_config(self, network_config: Dict[str, Any]):
         self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0)
         self.scoring_func = network_config.get("scoring_func", "softmax")
 
+    def _init_redundancy_expert_params(self):
+        self.redundancy_expert_num = get_redundancy_expert_num()
+        self.redundancy_expert_ids = get_redundancy_expert_ids(self.layer_num_)
+        self.auto_update_redundancy_expert: bool = get_env_start_args().auto_update_redundancy_expert
+        self.redundancy_expert_ids_tensor = torch.tensor(self.redundancy_expert_ids, dtype=torch.int64, device="cuda")
+        self.routed_expert_counter_tensor = torch.zeros((self.n_routed_experts,), dtype=torch.int64, device="cuda")
+
     def _init_parallel_params(self):
         self.local_n_routed_experts = self.n_routed_experts + self.num_fused_shared_experts
-        self.start_expert_id = 0
         self.split_inter_size = self.moe_intermediate_size // self.tp_world_size_
-        self.redundancy_expert_num = 0
         if self.enable_ep_moe:
             assert self.num_fused_shared_experts == 0, "num_fused_shared_experts must be 0 when enable_ep_moe"
-            self.redundancy_expert_num = get_redundancy_expert_num()
-            self.redundancy_expert_ids = get_redundancy_expert_ids(self.layer_num_)
+            logger.info(
+                f"global_rank {self.global_rank_} layerindex {self.layer_num_} "
+                f"redundancy_expertids: {self.redundancy_expert_ids}"
+            )
             self.local_n_routed_experts = self.n_routed_experts // self.global_world_size + self.redundancy_expert_num
-            self.start_expert_id = self.global_rank_ * self.n_routed_experts // self.global_world_size
             self.split_inter_size = self.moe_intermediate_size
+            n_experts_per_rank = self.n_routed_experts // self.global_world_size
+            start_expert_id = self.global_rank_ * n_experts_per_rank
+            self.local_expert_ids = (
+                list(range(start_expert_id, start_expert_id + n_experts_per_rank)) + self.redundancy_expert_ids
+            )
+            self.expert_idx_to_local_idx = {
+                expert_idx: expert_idx - start_expert_id for expert_idx in self.local_expert_ids[:n_experts_per_rank]
+            }
+            self.redundancy_expert_idx_to_local_idx = {
+                redundancy_expert_idx: n_experts_per_rank + i
+                for (i, redundancy_expert_idx) in enumerate(self.redundancy_expert_ids)
+            }
+        else:
+            self.local_expert_ids = list(range(self.n_routed_experts + self.num_fused_shared_experts))
+            self.expert_idx_to_local_idx = {expert_idx: i for (i, expert_idx) in enumerate(self.local_expert_ids)}
+            self.redundancy_expert_idx_to_local_idx = {}  # non-EP: no redundant experts to load
 
     def experts(
         self,
@@ -229,25 +258,12 @@ def load_hf_weights(self, weights):
         # Load bias
         if self.e_score_correction_bias_name in weights:
             self.e_score_correction_bias.copy_(weights[self.e_score_correction_bias_name])
-
-        # Load each expert with TP slicing
-        for i_experts in range(self.start_expert_id, self.start_expert_id + self.local_n_routed_experts):
-            with self.lock:
-                self._load_expert(i_experts, weights, type="weight", suffix=self.quant_method.weight_suffix)
-            if self.w13.weight_scale is not None:
-                with self.lock:
-                    self._load_expert(
-                        i_experts, weights, type="weight_scale", suffix=self.quant_method.weight_scale_suffix
-                    )
-            if self.w13.weight_zero_point is not None:
-                with self.lock:
-                    self._load_expert(
-                        i_experts, weights, type="weight_zero_point", suffix=self.quant_method.weight_zero_point_suffix
-                    )
+        self._load_weight(self.expert_idx_to_local_idx, weights)
+        if self.redundancy_expert_num > 0:
+            self._load_weight(self.redundancy_expert_idx_to_local_idx, weights)
 
     def verify_load(self):
         return True
-        return self.load_cnt == self.n_routed_experts * 3 * 2
 
     def _create_weight(self):
         intermediate_size = self.split_inter_size
@@ -276,31 +292,61 @@ def _create_weight(self):
         )
         self.load_cnt = 0
 
-    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0):
-        if self.quant_method.weight_need_quanted(weight):
-            self.quant_method.quantize(weight, weight_pack, start_idx)
-        else:
-            self.quant_method.load_weight(weight, weight_pack, start_idx)
+    def _load_weight(self, expert_idx_to_local_idx: Dict[int, int], weights: Dict[str, torch.Tensor]):
 
-    def _load_expert(self, expert_idx, weights, type: str, suffix: str = "weight"):
+        # Load each expert with TP slicing
+        for expert_idx, local_expert_idx in expert_idx_to_local_idx.items():
+            with self.lock:
+                self._load_expert(
+                    expert_idx, local_expert_idx, weights, type="weight", suffix=self.quant_method.weight_suffix
+                )
+            if self.w13.weight_scale is not None:
+                with self.lock:
+                    self._load_expert(
+                        expert_idx,
+                        local_expert_idx,
+                        weights,
+                        type="weight_scale",
+                        suffix=self.quant_method.weight_scale_suffix,
+                    )
+            if self.w13.weight_zero_point is not None:
+                with self.lock:
+                    self._load_expert(
+                        expert_idx,
+                        local_expert_idx,
+                        weights,
+                        type="weight_zero_point",
+                        suffix=self.quant_method.weight_zero_point_suffix,
+                    )
+
+    def _load_expert(
+        self,
+        expert_idx: int,
+        local_expert_idx: int,
+        weights: Dict[str, torch.Tensor],
+        type: str,
+        suffix: str = "weight",
+    ):
         w1_weight = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{suffix}"
         w2_weight = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{suffix}"
         w3_weight = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{suffix}"
         intermediate_size = self.split_inter_size
         load_func, slice_func = self._get_load_and_slice_func(type, is_row=True)
-        local_expert_idx = expert_idx - self.start_expert_id
         if w1_weight in weights:
             load_func(slice_func(weights[w1_weight]), self.w13.get_expert(local_expert_idx), start_idx=0)
-            self.load_cnt += 1
         if w3_weight in weights:
             load_func(
                 slice_func(weights[w3_weight]), self.w13.get_expert(local_expert_idx), start_idx=intermediate_size
             )
-            self.load_cnt += 1
         load_func, slice_func = self._get_load_and_slice_func(type, is_row=False)
         if w2_weight in weights:
             load_func(slice_func(weights[w2_weight]), self.w2.get_expert(local_expert_idx), start_idx=0)
-            self.load_cnt += 1
+
+    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0):
+        if self.quant_method.weight_need_quanted(weight):
+            self.quant_method.quantize(weight, weight_pack, start_idx)
+        else:
+            self.quant_method.load_weight(weight, weight_pack, start_idx)
 
     def _get_load_and_slice_func(self, type: str, is_row: bool = True):
         if is_row:
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index f3f153b0a..e7748b1df 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -1,9 +1,12 @@
+import os
 import torch
-from typing import Dict, Any
+import threading
+from typing import Optional, Tuple, List, Dict, Any
 
 from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight import FusedMoeWeight
+from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id
+from lightllm.common.quantization import Quantcfg
 from lightllm.utils.log_utils import init_logger
-from lightllm.common.quantization.quantize_method import QuantizationMethod
 
 logger = init_logger(__name__)
 
@@ -41,7 +44,7 @@ def __init__(
         network_config: Dict[str, Any],
         layer_num: int,
         world_size: int = 1,  # diff with FusedMoeWeightTP
-        quant_method: QuantizationMethod = None,
+        quant_cfg: Quantcfg = None,
     ) -> None:
         super().__init__(
             gate_up_proj_name,
@@ -55,7 +58,7 @@ def __init__(
             data_type,
             network_config,
             layer_num,
-            quant_method,
+            quant_cfg,
         )
         self.hidden_size = network_config["hidden_size"]
 
@@ -118,56 +121,7 @@ def router(self, router_logits, top_k):
         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
         return router_top_value, router_indices
 
-    def _native_forward(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        """PyTorch native implementation for GPT-OSS MoE forward pass."""
-        topk_weights, topk_ids = self.router(router_logits, top_k)
-
-        w1, w1_scale = self.w1
-        w2, w2_scale = self.w2
-
-        batch_size, hidden_size = input_tensor.shape
-
-        output = torch.zeros_like(input_tensor)
-        input_bf16 = input_tensor.to(torch.bfloat16)
-
-        for i in range(batch_size):
-            expert_output = torch.zeros(hidden_size, dtype=torch.bfloat16, device=input_tensor.device)
-            for j in range(top_k):
-                expert_idx = topk_ids[i, j].item()
-                weight = topk_weights[i, j]
-
-                w1_expert = w1[expert_idx]
-                w2_expert = w2[expert_idx]
-
-                x = input_bf16[i : i + 1]
-                hidden = torch.mm(x, w1_expert.T)  # [1, intermediate_size * 2]
-                if self.w1_bias is not None:
-                    hidden = hidden + self.w1_bias[expert_idx : expert_idx + 1]
-
-                gate = hidden[:, 0::2]
-                up = hidden[:, 1::2]
-
-                gate = torch.clamp(gate * self.alpha, -self.limit, self.limit)
-                gate = torch.nn.functional.sigmoid(gate)
-                hidden = gate * up
-
-                expert_out = torch.mm(hidden, w2_expert.T)
-                if self.w2_bias is not None:
-                    expert_out = expert_out + self.w2_bias[expert_idx : expert_idx + 1] / self.tp_world_size_
-
-                expert_output += weight * expert_out.squeeze(0)
-
-            output[i] = expert_output
-
-        input_tensor.copy_(output.to(input_tensor.dtype))
-        return output
-
-    def _cuda_forward(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        """CUDA optimized implementation for GPT-OSS MoE forward pass."""
+    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
         topk_weights, topk_ids = self.router(router_logits, top_k)
 
         w1, w1_scale = self.w1
@@ -194,18 +148,6 @@ def _cuda_forward(
         )
         return output_tensor
 
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
-        """Backward compatible method that routes to platform-specific implementation."""
-        return self._forward(
-            input_tensor=input_tensor,
-            router_logits=router_logits,
-            top_k=top_k,
-            renormalize=renormalize,
-            use_grouped_topk=use_grouped_topk,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-        )
-
     def _convert_moe_packed_tensors(
         self,
         blocks,
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
index 2f5d169eb..c56cd4da3 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
@@ -16,20 +16,31 @@ def __init__(
         self,
         n_routed_experts: int,
         num_fused_shared_experts: int,
-        redundancy_expert_num: int,
         routed_scaling_factor: float,
         quant_method: QuantizationMethod,
+        redundancy_expert_num: int,
+        redundancy_expert_ids_tensor: torch.Tensor,
+        routed_expert_counter_tensor: torch.Tensor,
+        auto_update_redundancy_expert: bool,
     ):
         self.n_routed_experts = n_routed_experts
         self.num_fused_shared_experts = num_fused_shared_experts
-        self.redundancy_expert_num = redundancy_expert_num
         self.routed_scaling_factor = routed_scaling_factor
         self.quant_method = quant_method
         self.global_rank_ = get_global_rank()
-        self.global_world_size = get_global_world_size()
+        self.global_world_size_ = get_global_world_size()
+        self.ep_n_routed_experts = self.n_routed_experts // self.global_world_size_
         self.total_expert_num_contain_redundancy = (
-            self.n_routed_experts + self.redundancy_expert_num * self.global_world_size
+            self.n_routed_experts + redundancy_expert_num * self.global_world_size_
         )
+
+        # redundancy expert related
+        self.redundancy_expert_num = redundancy_expert_num
+        self.redundancy_expert_ids_tensor = redundancy_expert_ids_tensor
+        self.routed_expert_counter_tensor = routed_expert_counter_tensor
+        self.auto_update_redundancy_expert = auto_update_redundancy_expert
+
+        # workspace for kernel optimization
         self.workspace = self.create_workspace()
 
     @abstractmethod
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py
index 9965246a2..8bcdb4bf9 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/triton_impl.py
@@ -10,12 +10,22 @@ def __init__(
         self,
         n_routed_experts: int,
         num_fused_shared_experts: int,
-        redundancy_expert_num: int,
         routed_scaling_factor: float,
         quant_method: QuantizationMethod,
+        redundancy_expert_num: int,
+        redundancy_expert_ids_tensor: torch.Tensor,
+        routed_expert_counter_tensor: torch.Tensor,
+        auto_update_redundancy_expert: bool,
     ):
         super().__init__(
-            n_routed_experts, num_fused_shared_experts, redundancy_expert_num, routed_scaling_factor, quant_method
+            n_routed_experts=n_routed_experts,
+            num_fused_shared_experts=num_fused_shared_experts,
+            routed_scaling_factor=routed_scaling_factor,
+            quant_method=quant_method,
+            redundancy_expert_num=redundancy_expert_num,
+            redundancy_expert_ids_tensor=redundancy_expert_ids_tensor,
+            routed_expert_counter_tensor=routed_expert_counter_tensor,
+            auto_update_redundancy_expert=auto_update_redundancy_expert,
         )
 
     def create_workspace(self):
diff --git a/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py b/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py
index e3a71379d..596eca4f2 100644
--- a/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py
+++ b/lightllm/server/router/model_infer/mode_backend/redundancy_expert_manager.py
@@ -8,10 +8,10 @@
 import json
 from typing import List
 from lightllm.common.basemodel.basemodel import TpPartBaseModel
-from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight_ep_redundancy import (
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.ep_redundancy import (
     FusedMoeWeightEPAutoRedundancy,
 )
-from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight_ep import FusedMoeWeightEP
+from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight import FusedMoeWeight
 from lightllm.utils.envs_utils import get_env_start_args, get_redundancy_expert_update_interval
 from lightllm.utils.envs_utils import get_redundancy_expert_update_max_load_count
 from lightllm.utils.envs_utils import get_redundancy_expert_num
@@ -28,7 +28,7 @@ def __init__(self, model: TpPartBaseModel):
         self.model = model
         self.ep_fused_moeweights: List[FusedMoeWeightEPAutoRedundancy] = []
         for layer in self.model.trans_layers_weight:
-            ep_weights = self._find_members_of_class(layer, FusedMoeWeightEP)
+            ep_weights = self._find_members_of_class(layer, FusedMoeWeight)
             assert len(ep_weights) <= 1
             self.ep_fused_moeweights.extend([FusedMoeWeightEPAutoRedundancy(e) for e in ep_weights])
 

From 461a7ad9953d558036e7387bc8c7f5545a848569 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 22 Jan 2026 17:43:03 +0000
Subject: [PATCH 27/65] remove weight_ep

---
 .../layer_weights/meta_weights/__init__.py    |   1 -
 .../fused_moe/fused_moe_weight_ep.py          | 692 ------------------
 2 files changed, 693 deletions(-)
 delete mode 100644 lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index ab0e5b604..8e884012d 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -9,5 +9,4 @@
 from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight, QKRMSNORMWeight
 from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
 from .att_sink_weight import TpAttSinkWeight
-from .fused_moe.fused_moe_weight_ep import FusedMoeWeightEP
 from .fused_moe.fused_moe_weight import FusedMoeWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
deleted file mode 100644
index 6659a98d4..000000000
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight_ep.py
+++ /dev/null
@@ -1,692 +0,0 @@
-import torch
-import threading
-from typing import Optional, Tuple, List, Dict, Any
-from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
-from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
-from lightllm.common.basemodel.layer_weights.meta_weights.platform_op import PlatformAwareOp
-from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe_ep import (
-    fused_experts_impl,
-    masked_group_gemm,
-    _deepgemm_grouped_fp8_nt_contiguous,
-)
-from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
-from lightllm.distributed import dist_group_manager
-from lightllm.common.basemodel.triton_kernel.fused_moe.topk_select import select_experts
-from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
-from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num
-from lightllm.utils.envs_utils import get_env_start_args
-from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import (
-    per_token_group_quant_fp8,
-    tma_align_input_scale,
-)
-from lightllm.common.basemodel.triton_kernel.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
-from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
-from lightllm.utils.log_utils import init_logger
-from lightllm.common.triton_utils.autotuner import Autotuner
-from lightllm.common.quantization.quantize_method import WeightPack
-
-
-logger = init_logger(__name__)
-
-
-class FusedMoeWeightEP(BaseWeightTpl, PlatformAwareOp):
-    def __init__(
-        self,
-        gate_proj_name: str,
-        down_proj_name: str,
-        up_proj_name: str,
-        e_score_correction_bias_name: str,
-        weight_prefix: str,
-        n_routed_experts: int,
-        data_type: torch.dtype,
-        network_config: Dict[str, Any],
-        layer_num: int,
-        quant_cfg=None,
-        hidden_size: Optional[int] = None,
-    ) -> None:
-        super().__init__()
-
-        self.layer_num = layer_num
-        self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
-        self.quantized_weight = quant_cfg.quantized_weight
-        if self.quant_method is not None:
-            self.weight_scale_suffix = self.quant_method.weight_scale_suffix
-            self.quant_method.is_moe = True
-            block_size = 1
-            if hasattr(self.quant_method, "block_size"):
-                block_size = self.quant_method.block_size
-            self.block_size = block_size
-
-        self.weight_prefix = weight_prefix
-        self.w1_weight_name = gate_proj_name
-        self.w2_weight_name = down_proj_name
-        self.w3_weight_name = up_proj_name
-        self.e_score_correction_bias_name = e_score_correction_bias_name
-        self.n_routed_experts = n_routed_experts
-        self.hidden_size = hidden_size
-
-        global_world_size = get_global_world_size()
-        self.global_rank_ = get_global_rank()
-        self.redundancy_expert_num = get_redundancy_expert_num()
-        self.redundancy_expert_ids = get_redundancy_expert_ids(layer_num)
-        logger.info(
-            f"global_rank {self.global_rank_} layerindex {layer_num} redundancy_expertids: {self.redundancy_expert_ids}"
-        )
-        self.redundancy_expert_ids_tensor = torch.tensor(self.redundancy_expert_ids, dtype=torch.int64, device="cuda")
-        self.routed_expert_counter_tensor = torch.zeros((self.n_routed_experts,), dtype=torch.int64, device="cuda")
-        self.total_expert_num_contain_redundancy = (
-            self.n_routed_experts + self.redundancy_expert_num * global_world_size
-        )
-        assert self.n_routed_experts % global_world_size == 0
-        self.ep_n_routed_experts = self.n_routed_experts // global_world_size
-        ep_load_expert_num = self.ep_n_routed_experts + self.redundancy_expert_num
-        self.ep_load_expert_num = ep_load_expert_num
-        self.experts_up_projs = [None] * ep_load_expert_num
-        self.experts_gate_projs = [None] * ep_load_expert_num
-        self.experts_up_proj_scales = [None] * ep_load_expert_num
-        self.experts_gate_proj_scales = [None] * ep_load_expert_num
-        self.e_score_correction_bias = None
-        self.w2_list = [None] * ep_load_expert_num
-        self.w2_scale_list = [None] * ep_load_expert_num
-        self.scoring_func = network_config.get("scoring_func", "softmax")
-        self.w1 = [None, None]  # weight, weight_scale
-        self.w2 = [None, None]  # weight, weight_scale
-        self.use_fp8_w8a8 = self.quant_method is not None
-        network_config["n_group"] = network_config.get("n_group", 0)
-        self.num_experts_per_tok = network_config["num_experts_per_tok"]
-        self.use_grouped_topk = network_config["n_group"] > 0
-        self.norm_topk_prob = network_config["norm_topk_prob"]
-        self.n_group = network_config["n_group"]
-        network_config["topk_group"] = network_config.get("topk_group", 0)
-        self.topk_group = network_config["topk_group"]
-        network_config["routed_scaling_factor"] = network_config.get("routed_scaling_factor", 1.0)
-        self.routed_scaling_factor = network_config["routed_scaling_factor"]
-
-        self.lock = threading.Lock()
-        # init buffer
-
-        # auto update redundancy expert vars
-        self.auto_update_redundancy_expert: bool = get_env_start_args().auto_update_redundancy_expert
-
-        # Pre-allocate memory if hidden_size is provided
-        if self.hidden_size is not None:
-            self._create_weight()
-
-    def _create_weight(self):
-        """Pre-allocate GPU memory for fused MoE weights"""
-        if self.hidden_size is None:
-            return
-
-        total_expert_num = self.ep_load_expert_num
-        # We need to determine intermediate size from network config or use a default
-        # This will be updated when first weight is loaded if needed
-        intermediate_size = getattr(self, "intermediate_size", None)
-        if intermediate_size is None:
-            # Default fallback - this will be corrected during load
-            intermediate_size = self.hidden_size * 4
-
-        if not self.quantized_weight and self.quant_method is not None:
-            # Quantized weights
-            w1_pack = self.quant_method.create_weight(
-                total_expert_num * intermediate_size * 2,
-                self.hidden_size,
-                dtype=self.data_type_,
-                device_id=self.device_id_,
-            )
-            self.w1[0] = w1_pack.weight.view(total_expert_num, intermediate_size * 2, self.hidden_size)
-            self.w1[1] = w1_pack.weight_scale.view(total_expert_num, intermediate_size * 2, self.hidden_size)
-
-            w2_pack = self.quant_method.create_weight(
-                total_expert_num * self.hidden_size,
-                intermediate_size,
-                dtype=self.data_type_,
-                device_id=self.device_id_,
-            )
-            self.w2[0] = w2_pack.weight.view(total_expert_num, self.hidden_size, intermediate_size)
-            self.w2[1] = w2_pack.weight_scale.view(total_expert_num, self.hidden_size, intermediate_size)
-        else:
-            # Regular weights
-            self.w1[0] = torch.empty(
-                (total_expert_num, intermediate_size * 2, self.hidden_size),
-                dtype=self.data_type_,
-                device=f"cuda:{self.device_id_}",
-            )
-            self.w2[0] = torch.empty(
-                (total_expert_num, self.hidden_size, intermediate_size),
-                dtype=self.data_type_,
-                device=f"cuda:{self.device_id_}",
-            )
-
-    def _select_experts(
-        self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-    ):
-        """Select experts and return topk weights and ids."""
-        topk_weights, topk_ids = select_experts(
-            hidden_states=input_tensor,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            scoring_func=self.scoring_func,
-        )
-        topk_weights.mul_(self.routed_scaling_factor)
-
-        if self.redundancy_expert_num > 0:
-            redundancy_topk_ids_repair(
-                topk_ids=topk_ids,
-                redundancy_expert_ids=self.redundancy_expert_ids_tensor,
-                ep_expert_num=self.ep_n_routed_experts,
-                global_rank=self.global_rank_,
-                expert_counter=self.routed_expert_counter_tensor,
-                enable_counter=self.auto_update_redundancy_expert,
-            )
-        return topk_weights, topk_ids
-
-    def _cuda_forward(
-        self,
-        input_tensor,
-        router_logits,
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        is_prefill,
-    ):
-        """CUDA optimized implementation for EP MoE forward pass."""
-        topk_weights, topk_ids = self._select_experts(
-            input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group
-        )
-
-        w1, w1_scale = self.w1
-        w2, w2_scale = self.w2
-        return fused_experts_impl(
-            hidden_states=input_tensor,
-            w1=w1,
-            w2=w2,
-            topk_weights=topk_weights,
-            topk_idx=topk_ids.to(torch.long),
-            num_experts=self.total_expert_num_contain_redundancy,  # number of all experts contain redundancy
-            buffer=dist_group_manager.ep_buffer,
-            is_prefill=is_prefill,
-            use_fp8_w8a8=self.use_fp8_w8a8,
-            use_fp8_all2all=self.use_fp8_w8a8,
-            use_int8_w8a16=False,  # default to False
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            previous_event=None,  # for overlap
-        )
-
-    def experts(
-        self,
-        input_tensor,
-        router_logits,
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        is_prefill,
-    ):
-        """Backward compatible method that routes to platform-specific implementation."""
-        return self._forward(
-            input_tensor=input_tensor,
-            router_logits=router_logits,
-            top_k=top_k,
-            renormalize=renormalize,
-            use_grouped_topk=use_grouped_topk,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            is_prefill=is_prefill,
-        )
-
-    def low_latency_dispatch(
-        self,
-        hidden_states: torch.Tensor,
-        router_logits: torch.Tensor,
-    ):
-
-        topk_weights, topk_idx = select_experts(
-            hidden_states=hidden_states,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=self.use_grouped_topk,
-            top_k=self.num_experts_per_tok,
-            renormalize=self.norm_topk_prob,
-            topk_group=self.topk_group,
-            num_expert_group=self.n_group,
-            scoring_func=self.scoring_func,
-        )
-        topk_weights.mul_(self.routed_scaling_factor)
-
-        if self.redundancy_expert_num > 0:
-            redundancy_topk_ids_repair(
-                topk_ids=topk_idx,
-                redundancy_expert_ids=self.redundancy_expert_ids_tensor,
-                ep_expert_num=self.ep_n_routed_experts,
-                global_rank=self.global_rank_,
-                expert_counter=self.routed_expert_counter_tensor,
-                enable_counter=self.auto_update_redundancy_expert,
-            )
-
-        topk_idx = topk_idx.to(torch.long)
-        num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank()
-        recv_x, masked_m, handle, event, hook = dist_group_manager.ep_buffer.low_latency_dispatch(
-            hidden_states,
-            topk_idx,
-            num_max_dispatch_tokens_per_rank,
-            self.total_expert_num_contain_redundancy,
-            use_fp8=self.use_fp8_w8a8,
-            async_finish=False,
-            return_recv_hook=True,
-        )
-        return recv_x, masked_m, topk_idx, topk_weights, handle, hook
-
-    def select_experts_and_quant_input(
-        self,
-        hidden_states: torch.Tensor,
-        router_logits: torch.Tensor,
-    ):
-        topk_weights, topk_idx = select_experts(
-            hidden_states=hidden_states,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=self.use_grouped_topk,
-            top_k=self.num_experts_per_tok,
-            renormalize=self.norm_topk_prob,
-            topk_group=self.topk_group,
-            num_expert_group=self.n_group,
-            scoring_func=self.scoring_func,
-        )
-        topk_weights.mul_(self.routed_scaling_factor)
-        if self.redundancy_expert_num > 0:
-            redundancy_topk_ids_repair(
-                topk_ids=topk_idx,
-                redundancy_expert_ids=self.redundancy_expert_ids_tensor,
-                ep_expert_num=self.ep_n_routed_experts,
-                global_rank=self.global_rank_,
-                expert_counter=self.routed_expert_counter_tensor,
-                enable_counter=self.auto_update_redundancy_expert,
-            )
-        M, K = hidden_states.shape
-        w1, w1_scale = self.w1
-        block_size_k = 0
-        if w1.ndim == 3:
-            block_size_k = w1.shape[2] // w1_scale.shape[2]
-        assert block_size_k == 128, "block_size_k must be 128"
-        qinput_tensor, input_scale = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w1.dtype)
-        return topk_weights, topk_idx.to(torch.long), (qinput_tensor, input_scale)
-
-    def dispatch(
-        self,
-        qinput_tensor: Tuple[torch.Tensor],
-        topk_idx: torch.Tensor,
-        topk_weights: torch.Tensor,
-        overlap_event: Optional[Any] = None,
-    ):
-        buffer = dist_group_manager.ep_buffer
-        # get_dispatch_layout
-        (
-            num_tokens_per_rank,
-            num_tokens_per_rdma_rank,
-            num_tokens_per_expert,
-            is_token_in_rank,
-            previous_event,
-        ) = buffer.get_dispatch_layout(
-            topk_idx,
-            self.total_expert_num_contain_redundancy,
-            previous_event=overlap_event,
-            async_finish=True,
-            allocate_on_comm_stream=True,
-        )
-        recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = buffer.dispatch(
-            qinput_tensor,
-            topk_idx=topk_idx,
-            topk_weights=topk_weights,
-            num_tokens_per_rank=num_tokens_per_rank,
-            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
-            is_token_in_rank=is_token_in_rank,
-            num_tokens_per_expert=num_tokens_per_expert,
-            previous_event=previous_event,
-            async_finish=True,
-            allocate_on_comm_stream=True,
-            expert_alignment=128,
-        )
-
-        def hook():
-            event.current_stream_wait()
-
-        return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, hook
-
-    def masked_group_gemm(
-        self, recv_x: Tuple[torch.Tensor], masked_m: torch.Tensor, dtype: torch.dtype, expected_m: int
-    ):
-        w1, w1_scale = self.w1
-        w2, w2_scale = self.w2
-        return masked_group_gemm(recv_x, masked_m, dtype, w1, w1_scale, w2, w2_scale, expected_m=expected_m)
-
-    def prefilled_group_gemm(
-        self,
-        num_recv_tokens_per_expert_list,
-        recv_x: Tuple[torch.Tensor],
-        recv_topk_idx: torch.Tensor,
-        recv_topk_weights: torch.Tensor,
-        hidden_dtype=torch.bfloat16,
-    ):
-        device = recv_x[0].device
-        w1, w1_scale = self.w1
-        w2, w2_scale = self.w2
-        _, K = recv_x[0].shape
-        _, N, _ = w1.shape
-        # scatter
-        all_tokens = sum(num_recv_tokens_per_expert_list)  # calcu padding all nums.
-        # gather_out shape [recive_num_tokens, hidden]
-        gather_out = torch.empty_like(recv_x[0], device=device, dtype=hidden_dtype)
-        if all_tokens > 0:
-            input_tensor = [
-                torch.empty((all_tokens, K), device=device, dtype=recv_x[0].dtype),
-                torch.empty((all_tokens, K // 128), device=device, dtype=torch.float32),
-            ]
-            # when m_indices is filled ok.
-            # m_indices show token use which expert, example, [0, 0, 0, 0, .... 1, 1, 1, 1,...., cur_expert_num - 1, ..]
-            # the count of 0 is num_recv_tokens_per_expert_list[0], the count of 1 is num_recv_tokens_per_expert_list[1]
-            # ...
-            m_indices = torch.empty(all_tokens, device=device, dtype=torch.int32)
-            # output_index shape [recive_num_tokens, topk_num]
-            # output_index use to show the token index in input_tensor
-            output_index = torch.empty_like(recv_topk_idx)
-
-            num_recv_tokens_per_expert = torch.tensor(
-                num_recv_tokens_per_expert_list, dtype=torch.int32, pin_memory=True, device="cpu"
-            ).cuda(non_blocking=True)
-
-            expert_start_loc = torch.empty_like(num_recv_tokens_per_expert)
-
-            ep_scatter(
-                recv_x[0],
-                recv_x[1],
-                recv_topk_idx,
-                num_recv_tokens_per_expert,
-                expert_start_loc,
-                input_tensor[0],
-                input_tensor[1],
-                m_indices,
-                output_index,
-            )
-            input_tensor[1] = tma_align_input_scale(input_tensor[1])
-            # groupgemm (contiguous layout)
-            gemm_out_a = torch.empty((all_tokens, N), device=device, dtype=hidden_dtype)
-
-            _deepgemm_grouped_fp8_nt_contiguous(input_tensor, (w1, w1_scale), gemm_out_a, m_indices)
-
-            # silu_and_mul_fwd + qaunt
-            # TODO fused kernel
-            silu_out = torch.empty((all_tokens, N // 2), device=device, dtype=hidden_dtype)
-
-            silu_and_mul_fwd(gemm_out_a.view(-1, N), silu_out)
-            qsilu_out, qsilu_out_scale = per_token_group_quant_fp8(
-                silu_out, self.block_size, dtype=w1.dtype, column_major_scales=True, scale_tma_aligned=True
-            )
-
-            # groupgemm (contiguous layout)
-            gemm_out_b = torch.empty((all_tokens, K), device=device, dtype=hidden_dtype)
-
-            _deepgemm_grouped_fp8_nt_contiguous((qsilu_out, qsilu_out_scale), (w2, w2_scale), gemm_out_b, m_indices)
-            # gather and local reduce
-            ep_gather(gemm_out_b, recv_topk_idx, recv_topk_weights, output_index, gather_out)
-        else:
-            ######################################## warning ##################################################
-            # here is used to match autotune feature, make moe model run same triton kernel in different rank.
-            # in some special case, one rank will recv 0 token, so add a token to make it run triton kernel.
-            if Autotuner.is_autotune_warmup():
-                _gemm_out_a = torch.zeros((1, N), device=device, dtype=hidden_dtype)
-                _silu_out = torch.zeros((1, N // 2), device=device, dtype=hidden_dtype)
-                silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
-                _gemm_out_a, _silu_out = None, None
-
-        return gather_out
-
-    def low_latency_combine(
-        self,
-        gemm_out_b: torch.Tensor,
-        topk_idx: torch.Tensor,
-        topk_weights: torch.Tensor,
-        handle: Any,
-    ):
-        combined_x, event_overlap, hook = dist_group_manager.ep_buffer.low_latency_combine(
-            gemm_out_b, topk_idx, topk_weights, handle, async_finish=False, return_recv_hook=True
-        )
-        return combined_x, hook
-
-    def combine(
-        self,
-        gemm_out_b: torch.Tensor,
-        handle: Any,
-        overlap_event: Optional[Any] = None,
-    ):
-        # normal combine
-        combined_x, _, event = dist_group_manager.ep_buffer.combine(
-            gemm_out_b,
-            handle,
-            topk_weights=None,
-            async_finish=True,
-            previous_event=overlap_event,
-            allocate_on_comm_stream=True,
-        )
-
-        def hook():
-            event.current_stream_wait()
-
-        return combined_x, hook
-
-    def _fuse(self):
-        if self.quantized_weight:
-            self._fuse_weight_scale()
-        with self.lock:
-            if (
-                hasattr(self, "experts_up_projs")
-                and None not in self.experts_up_projs
-                and None not in self.experts_gate_projs
-                and None not in self.w2_list
-            ):
-                gate_out_dim, gate_in_dim = self.experts_gate_projs[0].shape
-                up_out_dim, up_in_dim = self.experts_up_projs[0].shape
-                assert gate_in_dim == up_in_dim
-                dtype = self.experts_gate_projs[0].dtype
-                total_expert_num = self.ep_n_routed_experts + self.redundancy_expert_num
-
-                w1 = torch.empty((total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu")
-
-                for i_experts in range(self.ep_n_routed_experts + self.redundancy_expert_num):
-                    w1[i_experts, 0:gate_out_dim:, :] = self.experts_gate_projs[i_experts]
-                    w1[i_experts, gate_out_dim:, :] = self.experts_up_projs[i_experts]
-
-                inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
-                w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
-                if not self.quantized_weight and self.quant_method is not None:
-                    qw1_pack = self.quant_method.quantize(w1)
-                    qw2_pack = self.quant_method.quantize(w2)
-                    self.w1[0] = qw1_pack.weight
-                    self.w1[1] = qw1_pack.weight_scale
-                    self.w2[0] = qw2_pack.weight
-                    self.w2[1] = qw2_pack.weight_scale
-                else:
-                    self.w1[0] = self._cuda(w1)
-                    self.w2[0] = self._cuda(w2)
-                delattr(self, "w2_list")
-                delattr(self, "experts_up_projs")
-                delattr(self, "experts_gate_projs")
-
-    def _fuse_weight_scale(self):
-        with self.lock:
-            if (
-                hasattr(self, "experts_up_proj_scales")
-                and None not in self.experts_up_proj_scales
-                and None not in self.experts_gate_proj_scales
-                and None not in self.w2_scale_list
-            ):
-                gate_out_dim, gate_in_dim = self.experts_gate_proj_scales[0].shape
-                up_out_dim, up_in_dim = self.experts_up_proj_scales[0].shape
-                assert gate_in_dim == up_in_dim
-                dtype = self.experts_gate_proj_scales[0].dtype
-                total_expert_num = self.ep_n_routed_experts + self.redundancy_expert_num
-
-                w1_scale = torch.empty(
-                    (total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu"
-                )
-
-                for i_experts in range(self.ep_n_routed_experts + self.redundancy_expert_num):
-                    w1_scale[i_experts, 0:gate_out_dim:, :] = self.experts_gate_proj_scales[i_experts]
-                    w1_scale[i_experts, gate_out_dim:, :] = self.experts_up_proj_scales[i_experts]
-
-                inter_shape, hidden_size = self.w2_scale_list[0].shape[0], self.w2_scale_list[0].shape[1]
-                w2_scale = torch._utils._flatten_dense_tensors(self.w2_scale_list).view(
-                    len(self.w2_scale_list), inter_shape, hidden_size
-                )
-                self.w1[1] = self._cuda(w1_scale)
-                self.w2[1] = self._cuda(w2_scale)
-                delattr(self, "w2_scale_list")
-                delattr(self, "experts_up_proj_scales")
-                delattr(self, "experts_gate_proj_scales")
-
-    def load_hf_weights(self, weights):
-        n_expert_ep = self.ep_n_routed_experts
-
-        # Load bias
-        if self.e_score_correction_bias_name in weights:
-            self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])
-
-        # Get weight shapes from first expert to determine intermediate size
-        first_expert_idx = 0 + n_expert_ep * self.global_rank_
-        w1_weight_name = f"{self.weight_prefix}.{first_expert_idx}.{self.w1_weight_name}.weight"
-        if w1_weight_name in weights:
-            intermediate_size = weights[w1_weight_name].shape[0]
-            self.intermediate_size = intermediate_size
-
-            # Re-create weights with correct size if needed
-            if self.w1[0].shape[1] != intermediate_size * 2:
-                self._create_weight()
-
-        # Load regular experts
-        for i_experts_ep in range(n_expert_ep):
-            i_experts = i_experts_ep + n_expert_ep * self.global_rank_
-            self._copy_expert_weights(i_experts_ep, i_experts, weights)
-
-        # Load redundant experts
-        for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
-            self._copy_expert_weights(n_expert_ep + i, redundant_expert_id, weights)
-
-        if self.quantized_weight:
-            self._load_weight_scale_direct(weights)
-
-    def _copy_expert_weights(self, target_idx, expert_id, weights):
-        """Copy a single expert's weights to pre-allocated GPU memory"""
-        w1_weight = f"{self.weight_prefix}.{expert_id}.{self.w1_weight_name}.weight"
-        w2_weight = f"{self.weight_prefix}.{expert_id}.{self.w2_weight_name}.weight"
-        w3_weight = f"{self.weight_prefix}.{expert_id}.{self.w3_weight_name}.weight"
-
-        intermediate_size = self.intermediate_size
-
-        if w1_weight in weights and w3_weight in weights:
-            # Combine gate and up projections into w1
-            gate_weight = weights[w1_weight]  # [intermediate_size, hidden_size]
-            up_weight = weights[w3_weight]  # [intermediate_size, hidden_size]
-
-            # Copy to pre-allocated memory
-            if not self.quantized_weight and self.quant_method is not None:
-                # Quantized path
-                combined_cpu = torch.empty((intermediate_size * 2, self.hidden_size), dtype=gate_weight.dtype)
-                combined_cpu[:intermediate_size, :] = gate_weight
-                combined_cpu[intermediate_size:, :] = up_weight
-                quantized_pack = self.quant_method.quantize(combined_cpu)
-                self.w1[0][target_idx].copy_(quantized_pack.weight.view(intermediate_size * 2, self.hidden_size))
-                if quantized_pack.weight_scale is not None:
-                    self.w1[1][target_idx].copy_(
-                        quantized_pack.weight_scale.view(intermediate_size * 2, self.hidden_size)
-                    )
-            else:
-                # Regular path
-                self.w1[0][target_idx, :intermediate_size, :].copy_(gate_weight)
-                self.w1[0][target_idx, intermediate_size:, :].copy_(up_weight)
-
-        if w2_weight in weights:
-            # Copy w2 (down projection)
-            w2_weight_tensor = weights[w2_weight]  # [hidden_size, intermediate_size] - already the correct shape
-            if not self.quantized_weight and self.quant_method is not None:
-                quantized_pack = self.quant_method.quantize(w2_weight_tensor)
-                self.w2[0][target_idx].copy_(quantized_pack.weight)
-                if quantized_pack.weight_scale is not None:
-                    self.w2[1][target_idx].copy_(quantized_pack.weight_scale)
-            else:
-                self.w2[0][target_idx].copy_(w2_weight_tensor)
-
-    def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
-        n_expert_ep = self.ep_n_routed_experts
-        for i_experts_ep in range(n_expert_ep):
-            i_experts = i_experts_ep + n_expert_ep * self.global_rank_
-            w1_scale = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.{self.weight_scale_suffix}"
-            w2_scale = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.{self.weight_scale_suffix}"
-            w3_scale = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.{self.weight_scale_suffix}"
-            if w1_scale in weights:
-                self.experts_gate_proj_scales[i_experts_ep] = weights[w1_scale]
-            if w3_scale in weights:
-                self.experts_up_proj_scales[i_experts_ep] = weights[w3_scale]
-
-            if w2_scale in weights:
-                self.w2_scale_list[i_experts_ep] = weights[w2_scale]
-
-        # Load scale parameters for redundant experts
-        for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
-            i_experts = redundant_expert_id
-            w1_scale = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.{self.weight_scale_suffix}"
-            w2_scale = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.{self.weight_scale_suffix}"
-            w3_scale = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.{self.weight_scale_suffix}"
-            if w1_scale in weights:
-                self.experts_gate_proj_scales[n_expert_ep + i] = weights[w1_scale]
-            if w3_scale in weights:
-                self.experts_up_proj_scales[n_expert_ep + i] = weights[w3_scale]
-            if w2_scale in weights:
-                self.w2_scale_list[n_expert_ep + i] = weights[w2_scale]
-
-    def _load_weight_scale_direct(self, weights: Dict[str, torch.Tensor]) -> None:
-        """Load weight scales directly to pre-allocated GPU memory"""
-        n_expert_ep = self.ep_n_routed_experts
-
-        # Load regular expert scales
-        for i_experts_ep in range(n_expert_ep):
-            i_experts = i_experts_ep + n_expert_ep * self.global_rank_
-            self._copy_expert_scales(i_experts_ep, i_experts, weights)
-
-        # Load redundant expert scales
-        for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
-            self._copy_expert_scales(n_expert_ep + i, redundant_expert_id, weights)
-
-    def _copy_expert_scales(self, target_idx, expert_id, weights):
-        """Copy a single expert's weight scales to pre-allocated GPU memory"""
-        w1_scale = f"{self.weight_prefix}.{expert_id}.{self.w1_weight_name}.{self.weight_scale_suffix}"
-        w2_scale = f"{self.weight_prefix}.{expert_id}.{self.w2_weight_name}.{self.weight_scale_suffix}"
-        w3_scale = f"{self.weight_prefix}.{expert_id}.{self.w3_weight_name}.{self.weight_scale_suffix}"
-
-        intermediate_size = self.intermediate_size
-
-        if w1_scale in weights and w3_scale in weights:
-            # Combine gate and up projection scales into w1 scale
-            gate_scale = weights[w1_scale]  # [intermediate_size, hidden_size]
-            up_scale = weights[w3_scale]  # [intermediate_size, hidden_size]
-
-            # Copy to pre-allocated memory
-            self.w1[1][target_idx, :intermediate_size, :].copy_(gate_scale)
-            self.w1[1][target_idx, intermediate_size:, :].copy_(up_scale)
-
-        if w2_scale in weights:
-            # Copy w2 scale (down projection)
-            w2_scale_tensor = weights[w2_scale]  # [hidden_size, intermediate_size]
-            self.w2[1][target_idx].copy_(w2_scale_tensor)
-
-    def _cuda(self, cpu_tensor):
-        if self.quantized_weight:
-            return cpu_tensor.contiguous().cuda(self.device_id_)
-        return cpu_tensor.contiguous().to(self.data_type_).cuda(self.device_id_)

From 45630282f88fc298ee3e10581517ffff0fce6228 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Fri, 23 Jan 2026 10:24:38 +0000
Subject: [PATCH 28/65] add redundancy assert

---
 .../layer_weights/meta_weights/fused_moe/fused_moe_weight.py    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index ced1e9267..6a1bd0ca4 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -86,6 +86,8 @@ def _init_redundancy_expert_params(self):
         self.auto_update_redundancy_expert: bool = get_env_start_args().auto_update_redundancy_expert
         self.redundancy_expert_ids_tensor = torch.tensor(self.redundancy_expert_ids, dtype=torch.int64, device="cuda")
         self.routed_expert_counter_tensor = torch.zeros((self.n_routed_experts,), dtype=torch.int64, device="cuda")
+        # TODO: find out why deepep hangs when redundancy_expert_num is 1.
+        assert self.redundancy_expert_num != 1, "redundancy_expert_num can not be 1 for some unknown hang of deepep."
 
     def _init_parallel_params(self):
         self.local_n_routed_experts = self.n_routed_experts + self.num_fused_shared_experts

From 5798c723900273eddf1742f957144c3bd58abc70 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Fri, 23 Jan 2026 10:52:03 +0000
Subject: [PATCH 29/65] fix mm weight with bias

---
 .../basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 1133e4d6a..56aa322b4 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -116,7 +116,7 @@ def load_hf_weights(self, weights):
     def _create_weight(self):
         self.bias = None
         if self.bias_names is not None:
-            self.bias = torch.empty(self.cusum_out_dims[-1], dtype=self.data_type_).cuda(get_current_device_id())
+            self.bias = torch.empty(sum(self.out_dims), dtype=self.data_type_).cuda(get_current_device_id())
             self.bias._load_ok = [False] * len(self.bias_names)
         self.mm_param: WeightPack = self.quant_method.create_weight(
             in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()

From 9a0db719ff468053d6e423cf524f94f44bf9f129 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 05:26:04 +0000
Subject: [PATCH 30/65] fix internvl

---
 .../pre_and_post_layer_weight.py              | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
index 79dc9d95c..0d753aef8 100644
--- a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
@@ -4,6 +4,7 @@
 import torch.nn.functional as F
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.utils.dist_utils import get_current_device_id
+from lightllm.common.basemodel.layer_weights.meta_weights import LayerNormWeight
 
 
 class ViTPreAndPostLayerWeight(PreAndPostLayerWeight):
@@ -13,6 +14,8 @@ def __init__(self, data_type, network_config):
         self.image_size = self.network_config_["image_size"]
         self.patch_size = self.network_config_["patch_size"]
         self.llm_hidden_size = self.network_config_["llm_hidden_size"]
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
         self._create_weight()
         return
 
@@ -24,18 +27,12 @@ def _create_weight(self):
 
         # Pre-allocate memory for vision model weights
         self.class_embedding = torch.empty((1, 1, split_embed_dim), dtype=self.data_type_).cuda()
-        self.position_embedding = torch.empty(
-            (1, 197, split_embed_dim), dtype=self.data_type_
-        ).cuda()  # 197 = (224//16)^2 + 1
+        self.position_embedding = torch.empty((1, self.num_positions, split_embed_dim), dtype=self.data_type_).cuda()
         self.patch_embedding_weight_ = torch.empty(
             (split_embed_dim, 3, self.patch_size, self.patch_size), dtype=self.data_type_
         ).cuda()
         self.patch_embedding_bias_ = torch.empty(split_embed_dim, dtype=self.data_type_).cuda()
 
-        # Pre-allocate memory for adapter weights
-        self.layernorm_weight_ = torch.empty(self.embed_dim, dtype=self.data_type_).cuda()
-        self.layernorm_bias_ = torch.empty(self.embed_dim, dtype=self.data_type_).cuda()
-
         split_indexes_llm = np.linspace(0, self.llm_hidden_size, self.tp_world_size_ + 1, dtype=np.int64)
         split_start_llm = split_indexes_llm[self.tp_rank_]
         split_end_llm = split_indexes_llm[self.tp_rank_ + 1]
@@ -45,6 +42,13 @@ def _create_weight(self):
         self.mlp1_1_bias_ = torch.empty(split_llm_hidden_size, dtype=self.data_type_).cuda()
         self.mlp1_3_weight_ = torch.empty((split_llm_hidden_size, self.llm_hidden_size), dtype=self.data_type_).cuda()
         self.mlp1_3_bias_ = torch.empty(self.llm_hidden_size, dtype=self.data_type_).cuda()
+
+        self.layernorm_weight_ = LayerNormWeight(
+            dim=self.embed_dim,
+            weight_name="mlp1.0.weight",
+            data_type=self.data_type_,
+            bias_name="mlp1.0.bias",
+        )
         return
 
     def _cuda(self, cpu_tensor):
@@ -68,6 +72,7 @@ def _get_pos_embed(self, H, W):
         return pos_embed
 
     def load_hf_weights(self, weights):
+        super().load_hf_weights(weights)
         split_indexes = np.linspace(0, self.embed_dim, self.tp_world_size_ + 1, dtype=np.int64)
         split_start = split_indexes[self.tp_rank_]
         split_end = split_indexes[self.tp_rank_ + 1]
@@ -86,11 +91,6 @@ def load_hf_weights(self, weights):
                 weights["vision_model.embeddings.patch_embedding.bias"][split_start:split_end]
             )
 
-        if "mlp1.0.weight" in weights:
-            self.layernorm_weight_.copy_(weights["mlp1.0.weight"])
-        if "mlp1.0.bias" in weights:
-            self.layernorm_bias_.copy_(weights["mlp1.0.bias"])
-
         split_indexes = np.linspace(0, self.llm_hidden_size, self.tp_world_size_ + 1, dtype=np.int64)
         split_start = split_indexes[self.tp_rank_]
         split_end = split_indexes[self.tp_rank_ + 1]

From 29d0f482658bea29851d5f38ad75aa182e009536 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 07:21:54 +0000
Subject: [PATCH 31/65] fix unit tests

---
 unit_tests/common/fused_moe/test_grouped_fused_moe.py     | 7 ++++++-
 .../common/fused_moe/test_grouped_fused_moe_speed.py      | 2 +-
 unit_tests/common/fused_moe/test_grouped_topk.py          | 4 ++--
 .../fused_moe/test_moe_silu_and_mul_mix_quant_ep.py       | 8 +++++---
 unit_tests/common/fused_moe/test_softmax_topk.py          | 2 +-
 5 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/unit_tests/common/fused_moe/test_grouped_fused_moe.py b/unit_tests/common/fused_moe/test_grouped_fused_moe.py
index 9a613f6f7..9c08cfc1a 100644
--- a/unit_tests/common/fused_moe/test_grouped_fused_moe.py
+++ b/unit_tests/common/fused_moe/test_grouped_fused_moe.py
@@ -2,7 +2,12 @@
 import time
 import pytest
 import triton
-from lightllm.common.fused_moe.grouped_fused_moe import moe_align, moe_align1, moe_align2, grouped_matmul
+from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe import (
+    moe_align,
+    moe_align1,
+    moe_align2,
+    grouped_matmul,
+)
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
diff --git a/unit_tests/common/fused_moe/test_grouped_fused_moe_speed.py b/unit_tests/common/fused_moe/test_grouped_fused_moe_speed.py
index 03beccdf9..769002517 100644
--- a/unit_tests/common/fused_moe/test_grouped_fused_moe_speed.py
+++ b/unit_tests/common/fused_moe/test_grouped_fused_moe_speed.py
@@ -1,7 +1,7 @@
 import torch
 import time
 import pytest
-from lightllm.common.fused_moe.grouped_fused_moe import moe_align, moe_align1, grouped_matmul
+from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe import moe_align, moe_align1, grouped_matmul
 from lightllm.utils.log_utils import init_logger
 
 seed = 42
diff --git a/unit_tests/common/fused_moe/test_grouped_topk.py b/unit_tests/common/fused_moe/test_grouped_topk.py
index 37c3fabc7..432e13316 100755
--- a/unit_tests/common/fused_moe/test_grouped_topk.py
+++ b/unit_tests/common/fused_moe/test_grouped_topk.py
@@ -2,8 +2,8 @@
 import time
 import pytest
 import numpy as np
-from lightllm.common.fused_moe.grouped_topk import triton_grouped_topk
-from lightllm.common.fused_moe.topk_select import biased_grouped_topk as grouped_topk
+from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_topk import triton_grouped_topk
+from lightllm.common.basemodel.triton_kernel.fused_moe.topk_select import biased_grouped_topk as grouped_topk
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
diff --git a/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py b/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py
index ab2cc4976..29aed2a70 100644
--- a/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py
+++ b/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py
@@ -14,8 +14,10 @@ def is_fp8_native_supported():
     pytest.skip(reason="not support fp8 test in this gpu card", allow_module_level=True)
 
 import random
-from lightllm.common.fused_moe.moe_silu_and_mul_mix_quant_ep import silu_and_mul_masked_post_quant_fwd
-from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
+from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul_mix_quant_ep import (
+    silu_and_mul_masked_post_quant_fwd,
+)
+from lightllm.common.basemodel.triton_kernel.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
 from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.utils.log_utils import init_logger
 
@@ -37,7 +39,7 @@ def is_fp8_native_supported():
 )
 def test_silu_and_mul_masked(expert_num, token_num, hidden_dim):
     quant_group_size = 128
-    in_tensor = torch.randn((expert_num, token_num, hidden_dim), dtype=torch.float16, device="cuda")
+    in_tensor = torch.randn((expert_num, token_num, hidden_dim), dtype=torch.bfloat16, device="cuda")
     out_tensor = torch.empty((expert_num, token_num, hidden_dim // 2), dtype=torch.float8_e4m3fn, device="cuda")
     out_scale_tensor = torch.randn(
         (expert_num, token_num, hidden_dim // 2 // quant_group_size), dtype=torch.float32, device="cuda"
diff --git a/unit_tests/common/fused_moe/test_softmax_topk.py b/unit_tests/common/fused_moe/test_softmax_topk.py
index 6252dfa8c..7c3e483df 100755
--- a/unit_tests/common/fused_moe/test_softmax_topk.py
+++ b/unit_tests/common/fused_moe/test_softmax_topk.py
@@ -2,7 +2,7 @@
 import time
 import pytest
 import numpy as np
-from lightllm.common.fused_moe.softmax_topk import softmax_topk
+from lightllm.common.basemodel.triton_kernel.fused_moe.softmax_topk import softmax_topk
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)

From 4734f3ef8ba20d774d32f0abb21028236e597700 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 07:38:31 +0000
Subject: [PATCH 32/65] lmhead fix

---
 .../meta_weights/embedding_weight.py          | 21 +++----------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
index d4e03d0a1..d94a4c709 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/embedding_weight.py
@@ -12,8 +12,6 @@ def __init__(self, dim: int, vocab_size: int, weight_name: str, data_type: torch
         super().__init__()
         self.dim = dim
         self.vocab_size = vocab_size
-        self.tp_world_size_ = get_dp_world_size()
-        self.tp_rank_ = get_current_rank_in_dp()
         # 计算 split_indexes
         split_indexes = np.linspace(0, self.vocab_size, self.tp_world_size_ + 1, dtype=np.int64)
         self.tp_vocab_start_id = int(split_indexes[self.tp_rank_])
@@ -86,7 +84,7 @@ def __call__(
         return self._forward(input_ids=input_ids, out=out, alloc_func=alloc_func)
 
 
-class LMHeadWeight(BaseWeightTpl, PlatformAwareOp):
+class LMHeadWeight(EmbeddingWeight):
     def __init__(
         self,
         dim: int,
@@ -95,27 +93,14 @@ def __init__(
         data_type: torch.dtype,
         embedding_weight: Optional[EmbeddingWeight] = None,
     ):
-        super().__init__()
-        self.dim = dim
-        self.vocab_size = vocab_size
-        self.tp_world_size_ = get_dp_world_size()
-        self.tp_rank_ = get_current_rank_in_dp()
-        # 计算 split_indexes
-        split_indexes = np.linspace(0, self.vocab_size, self.tp_world_size_ + 1, dtype=np.int64)
-        self.tp_vocab_start_id = int(split_indexes[self.tp_rank_])
-        self.tp_vocab_end_id = int(split_indexes[self.tp_rank_ + 1])
-        self.weight_name: str = weight_name
-        self.data_type_ = data_type
         self._embedding_weight = embedding_weight
-        self._create_weight()
+        super().__init__(dim=dim, vocab_size=vocab_size, weight_name=weight_name, data_type=data_type)
 
     def _create_weight(self):
         if self._embedding_weight is not None:
             self.weight = self._embedding_weight.weight
             return
-        tp_vocab_size = self.tp_vocab_end_id - self.tp_vocab_start_id
-        self.weight: torch.Tensor = torch.empty(tp_vocab_size, self.dim, dtype=self.data_type_, device=self.device_id_)
-        self.weight.load_ok = False
+        super()._create_weight()
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         # When set tile_embedding=True, no need to load - EmbeddingWeight already loaded it

From ea486b405e1c00ff9bf5620e1845ee8f4cc57ca4 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 07:47:28 +0000
Subject: [PATCH 33/65] remove cnt

---
 .../common/basemodel/layer_weights/meta_weights/norm_weight.py   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index d4717386b..5fd841753 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -189,7 +189,6 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
             self.weight.copy_(weights[self.weight_name])
             self.weight += 1
-            self.load_cnt += 1
 
 
 class QKRMSNORMWeight(RMSNormWeight):

From 3a0009b14f51c0e57512eb5120ea9b2cae3cd97c Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Mon, 26 Jan 2026 08:21:36 +0000
Subject: [PATCH 34/65] remove rmsnorm bias_name input.

---
 .../layer_weights/meta_weights/norm_weight.py     | 15 +++++++--------
 .../layer_weights/pre_and_post_layer_weight.py    |  3 ---
 .../layer_weights/transformer_layer_weight.py     |  2 --
 .../layer_weights/pre_and_post_layer_weight.py    |  1 -
 .../layer_weights/transformer_layer_weight.py     |  1 -
 .../layer_weights/pre_and_post_layer_weight.py    |  2 --
 .../layer_weights/transformer_layer_weight.py     |  1 -
 .../vit/layer_weights/transformer_layer_weight.py |  2 --
 8 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 5fd841753..0ce6ba2f1 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -9,12 +9,11 @@
 
 
 class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
-    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
         super().__init__()
         self.dim = dim
         self.weight_name = weight_name
         self.data_type_ = data_type
-        assert bias_name is None, "RMSNormWeight does not have bias"
         self._create_weight()
 
     def _create_weight(self):
@@ -138,8 +137,8 @@ def __call__(
 
 
 class TpRMSNormWeight(RMSNormWeight):
-    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type, bias_name=bias_name)
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
+        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type)
         self.tp_world_size_ = get_dp_world_size()
         self.tp_rank_ = get_current_rank_in_dp()
         self.dim = self._get_tp_padded_dim(dim=dim)
@@ -180,8 +179,8 @@ def load_hf_weights(self, weights):
 
 
 class NoTpGEMMANormWeight(RMSNormWeight):
-    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type, bias_name=bias_name)
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
+        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type)
         self.tp_world_size_ = 1
         self.tp_rank_ = 0
 
@@ -192,8 +191,8 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
 
 
 class QKRMSNORMWeight(RMSNormWeight):
-    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type, bias_name=bias_name)
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
+        super().__init__(dim=dim, weight_name=weight_name, data_type=data_type)
         self.tp_world_size_ = 1
         self.tp_rank_ = 0
 
diff --git a/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
index 719c80c27..1df695df0 100644
--- a/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -23,19 +23,16 @@ def __init__(self, data_type, network_config):
             dim=hidden_size,
             weight_name="model.layers.0.enorm.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
         self.hnorm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.layers.0.hnorm.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
         self.final_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.layers.0.shared_head.norm.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
 
         # 与DeepseekV3模型共享, 不通过 load 加载
diff --git a/lightllm/models/llama/layer_weights/transformer_layer_weight.py b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
index b68903ecd..0566c9f1c 100644
--- a/lightllm/models/llama/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/llama/layer_weights/transformer_layer_weight.py
@@ -116,11 +116,9 @@ def _init_norm(self):
             dim=hidden_size,
             weight_name=self._att_norm_weight_name,
             data_type=self.data_type_,
-            bias_name=self._att_norm_bias_name,
         )
         self.ffn_norm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name=self._ffn_norm_weight_name,
             data_type=self.data_type_,
-            bias_name=self._ffn_norm_bias_name,
         )
diff --git a/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
index a65250b16..0b6dcf137 100644
--- a/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -28,7 +28,6 @@ def __init__(self, data_type, network_config):
             dim=hidden_size,
             weight_name="mtp.hnorm.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
 
         self.wte_weight_: EmbeddingWeight = None
diff --git a/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py b/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
index 2cbc6cf58..8d3f94f8f 100644
--- a/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/mistral_mtp/layer_weights/transformer_layer_weight.py
@@ -50,5 +50,4 @@ def _init_norm(self):
             dim=hidden_size,
             weight_name=self._ffn_norm_weight_name,
             data_type=self.data_type_,
-            bias_name=self._ffn_norm_bias_name,
         )
diff --git a/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
index e3a557d55..924f01c46 100644
--- a/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -24,13 +24,11 @@ def __init__(self, data_type, network_config):
             dim=hidden_size,
             weight_name="model.layers.0.norm_after_embedding.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
         self.hnorm_weight_ = RMSNormWeight(
             dim=hidden_size,
             weight_name="model.layers.0.norm_before_output.weight",
             data_type=self.data_type_,
-            bias_name=None,
         )
         # 与Qwen3MOE模型共享
         self.wte_weight_: EmbeddingWeight = None
diff --git a/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py
index 2a11724ce..12bb96980 100644
--- a/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/qwen3_moe_mtp/layer_weights/transformer_layer_weight.py
@@ -21,5 +21,4 @@ def _init_norm(self):
             dim=hidden_size,
             weight_name=self._ffn_norm_weight_name,
             data_type=self.data_type_,
-            bias_name=self._ffn_norm_bias_name,
         )
diff --git a/lightllm/models/vit/layer_weights/transformer_layer_weight.py b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
index 54ad36786..03ce2a7a3 100644
--- a/lightllm/models/vit/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
@@ -142,13 +142,11 @@ def _init_norm(self):
                 dim=hidden_size,
                 weight_name=self._q_norm_weight_name,
                 data_type=self.data_type_,
-                bias_name=None,
             )
             self.k_norm_weight_ = TpRMSNormWeight(
                 dim=hidden_size,
                 weight_name=self._k_norm_weight_name,
                 data_type=self.data_type_,
-                bias_name=None,
             )
 
     def load_hf_weights(self, weights):

From b917009609871b854c4805549dcb18c298d50967 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Mon, 26 Jan 2026 08:48:05 +0000
Subject: [PATCH 35/65] fix att sink

---
 .../meta_weights/att_sink_weight.py           | 47 ++++++++++++++++---
 .../layer_weights/transformer_layer_weight.py |  2 +
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
index 32d59e66e..f3952992c 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
@@ -1,21 +1,56 @@
 import torch
-from typing import Dict
+from typing import Dict, Tuple
 from .base_weight import BaseWeightTpl
 from lightllm.utils.dist_utils import get_current_device_id
 
 
 class TpAttSinkWeight(BaseWeightTpl):
-    def __init__(self, weight_name: str, data_type):
+    def __init__(self, all_kv_head_num: int, head_dim: int, weight_name: str, data_type):
         super().__init__()
+        self.all_kv_head_num = all_kv_head_num
+        self.head_dim = head_dim
         self.weight_name = weight_name
         self.data_type_ = data_type
-        self.weight: torch.Tensor = None
-        # TODO: add create weight function
+        self._start_head_index, self._end_head_index = self._get_head_tp_split_params(all_head_num=self.all_kv_head_num)
+        self._create_weight()
+
+    def _create_weight(self):
+        self.weight = torch.empty((self.all_kv_head_num, self.head_dim), dtype=self.data_type_, device="cuda")
+        self.weight.load_ok = False
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name not in weights or self.weight is not None:
             return
 
         t_weight = weights[self.weight_name]
-        start_head_index, end_head_index = self._get_head_tp_split_params(weight=t_weight)
-        self.weight = t_weight[start_head_index:end_head_index].to(self.data_type_).cuda(get_current_device_id())
+        self.weight = (
+            t_weight[self._start_head_index : self._end_head_index].to(self.data_type_).cuda(get_current_device_id())
+        )
+        self.weight.load_ok = True
+
+    def verify_load(self):
+        return self.weight.load_ok
+
+    def _get_head_tp_split_params(self, all_head_num: int) -> Tuple[int, int]:
+        """
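+        Return the [start, end) head-index range owned by this tensor-parallel rank.
+        A common helper for splitting heads across tp ranks; subclasses may reuse it.
+        When tp_world_size_ exceeds all_head_num, several ranks share a single head.
+        :param all_head_num: total number of heads before the tp split
+        :type all_head_num: int
+        :return: (start_head_index, end_head_index), end exclusive
+        :rtype: Tuple[int, int]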
+        """
+        tp_head_num = all_head_num // self.tp_world_size_
+
+        if tp_head_num > 0:
+            start_head_index = self.tp_rank_ * tp_head_num
+            end_head_index = (self.tp_rank_ + 1) * tp_head_num
+        else:
+            # Special case: tp_world_size is larger than all_head_num.
+            scale_size = self.tp_world_size_ // all_head_num
+            assert self.tp_world_size_ % all_head_num == 0
+            start_head_index = self.tp_rank_ // scale_size
+            end_head_index = start_head_index + 1
+
+        return start_head_index, end_head_index
diff --git a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
index e6d58c3b2..abf2b07e9 100644
--- a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
@@ -72,6 +72,8 @@ def _init_weight(self):
         super()._init_weight()
 
         self.attn_sinks = TpAttSinkWeight(
+            all_kv_head_num=self.q_head_num_ + self.k_head_num_,
+            head_dim=self.head_dim,
             weight_name=f"model.layers.{self.layer_num_}.self_attn.sinks",
             data_type=torch.bfloat16,
         )

From b1b38e2b0383e04a93bce11afbbf6950b4a44ac7 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 10:46:28 +0000
Subject: [PATCH 36/65] add cpu weight_buffer

---
 .../fused_moe/fused_moe_weight.py             | 57 ++++++++++++-------
 .../meta_weights/mm_weight/mm_weight.py       | 30 +++++-----
 lightllm/common/quantization/awq.py           | 41 +++++--------
 lightllm/common/quantization/deepgemm.py      |  8 ---
 lightllm/common/quantization/no_quant.py      |  4 +-
 .../common/quantization/quantize_method.py    | 55 ++++++++++++------
 lightllm/common/quantization/w8a8.py          | 26 +++++----
 7 files changed, 121 insertions(+), 100 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index 6a1bd0ca4..a492d82ef 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -1,11 +1,12 @@
 import torch
 import threading
-from typing import Dict, Any, Optional, Tuple
+from typing import Dict, Any, Optional, Tuple, List
 from lightllm.common.basemodel.layer_weights.meta_weights.base_weight import BaseWeightTpl
 from lightllm.common.quantization.quantize_method import WeightPack
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_slicer import (
     get_row_slice_mixin,
     get_col_slice_mixin,
+    SliceMixinTpl,
 )
 from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.impl import select_fuse_moe_impl
 from lightllm.common.quantization.quantize_method import QuantizationMethod
@@ -285,6 +286,7 @@ def _create_weight(self):
             device_id=self.device_id_,
             num_experts=self.local_n_routed_experts,
         )
+        self.w13_list: List[WeightPack] = self._get_expert_weight_list(self.w13, 2)
         self.w2: WeightPack = self.quant_method.create_weight(
             out_dim=self.hidden_size,
             in_dim=intermediate_size,
@@ -292,8 +294,17 @@ def _create_weight(self):
             device_id=self.device_id_,
             num_experts=self.local_n_routed_experts,
         )
+        self.w2_list: List[WeightPack] = self._get_expert_weight_list(self.w2, 1)
         self.load_cnt = 0
 
+    def _get_expert_weight_list(self, weight_pack: WeightPack, weight_num: int = 1):
+        weight_list = []
+        for idx in range(self.local_n_routed_experts):
+            expert_weight = weight_pack.get_expert(idx)
+            expert_weight.create_cpu_buffer(weight_num)
+            weight_list.append(expert_weight)
+        return weight_list
+
     def _load_weight(self, expert_idx_to_local_idx: Dict[int, int], weights: Dict[str, torch.Tensor]):
 
         # Load each expert with TP slicing
@@ -332,32 +343,38 @@ def _load_expert(
         w1_weight = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{suffix}"
         w2_weight = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{suffix}"
         w3_weight = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{suffix}"
-        intermediate_size = self.split_inter_size
-        load_func, slice_func = self._get_load_and_slice_func(type, is_row=True)
+        load_func = self._get_load_func(type)
+        row_slice_func = self._get_slice_func(self.row_slicer, type)
+        col_slice_func = self._get_slice_func(self.col_slicer, type)
         if w1_weight in weights:
-            load_func(slice_func(weights[w1_weight]), self.w13.get_expert(local_expert_idx), start_idx=0)
+            self.w13_list[local_expert_idx].weight_cpu_buffer[0] = row_slice_func(weights[w1_weight])
         if w3_weight in weights:
-            load_func(
-                slice_func(weights[w3_weight]), self.w13.get_expert(local_expert_idx), start_idx=intermediate_size
-            )
-        load_func, slice_func = self._get_load_and_slice_func(type, is_row=False)
+            self.w13_list[local_expert_idx].weight_cpu_buffer[1] = row_slice_func(weights[w3_weight])
+        w13_weight = self.w13_list[local_expert_idx].get_fused_weight_part(suffix)
+        load_func(w13_weight, self.w13_list[local_expert_idx])
         if w2_weight in weights:
-            load_func(slice_func(weights[w2_weight]), self.w2.get_expert(local_expert_idx), start_idx=0)
+            self.w2_list[local_expert_idx].weight_cpu_buffer[0] = col_slice_func(weights[w2_weight])
+        w2_weight = self.w2_list[local_expert_idx].get_fused_weight_part(suffix)
+        load_func(w2_weight, self.w2_list[local_expert_idx])
 
-    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0):
+    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack):
         if self.quant_method.weight_need_quanted(weight):
-            self.quant_method.quantize(weight, weight_pack, start_idx)
+            self.quant_method.quantize(weight, weight_pack)
         else:
-            self.quant_method.load_weight(weight, weight_pack, start_idx)
+            self.quant_method.load_weight(weight, weight_pack)
 
-    def _get_load_and_slice_func(self, type: str, is_row: bool = True):
-        if is_row:
-            slicer = self.row_slicer
-        else:
-            slicer = self.col_slicer
+    def _get_load_func(self, type: str):
+        if type == "weight":
+            return self._load_weight_func
+        elif type == "weight_scale":
+            return getattr(self.quant_method, "load_weight_scale")
+        elif type == "weight_zero_point":
+            return getattr(self.quant_method, "load_weight_zero_point")
+
+    def _get_slice_func(self, slicer: SliceMixinTpl, type: str):
         if type == "weight":
-            return self._load_weight_func, slicer._slice_weight
+            return slicer._slice_weight
         elif type == "weight_scale":
-            return getattr(self.quant_method, "load_weight_scale"), slicer._slice_weight_scale
+            return slicer._slice_weight_scale
         elif type == "weight_zero_point":
-            return getattr(self.quant_method, "load_weight_zero_point"), slicer._slice_weight_zero_point
+            return slicer._slice_weight_zero_point
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 56aa322b4..de3b36669 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -121,7 +121,9 @@ def _create_weight(self):
         self.mm_param: WeightPack = self.quant_method.create_weight(
             in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()
         )
-        self.mm_param.initialize_load_status(len(self.weight_names))
+        # For fused weights such as gate_up_proj, we first load them into a CPU buffer
+        # for online quantization (e.g., per-tensor quantization).
+        self.mm_param.create_cpu_buffer(len(self.weight_names))
         return
 
     # 执行顺序
@@ -130,16 +132,12 @@ def _load_weight(
     ) -> None:
         if param_name in weights:
             weight = self.param_slicer._slice_weight(weights[param_name])
-            start_idx = self.cusum_out_dims[sub_child_index]
+            self.mm_param.weight_cpu_buffer[sub_child_index] = weight
+            weight = self.mm_param.get_fused_weight_part("weight")
             if self.quant_method.weight_need_quanted(weight):
-                self.quant_method.quantize(weight, self.mm_param, offset=start_idx)
-                # weight_scale and zero_point will be computed during online quantization.
-                # so we set them to True here.
-                self.mm_param.load_ok[sub_child_index][1] = True
-                self.mm_param.load_ok[sub_child_index][2] = True
+                self.quant_method.quantize(weight, self.mm_param)
             else:
-                self.quant_method.load_weight(weight, self.mm_param, start_idx)
-            self.mm_param.load_ok[sub_child_index][0] = True
+                self.quant_method.load_weight(weight, self.mm_param)
         return
 
     def _load_bias(
@@ -158,9 +156,9 @@ def _load_weight_scale(
     ) -> None:
         if param_name in weights:
             weight_scale = self.param_slicer._slice_weight_scale(weights[param_name])
-            start_idx = self.cusum_out_dims[sub_child_index]
-            self.quant_method.load_weight_scale(weight_scale, self.mm_param, start_idx)
-            self.mm_param.load_ok[sub_child_index][1] = True
+            self.mm_param.weight_scale_cpu_buffer[sub_child_index] = weight_scale
+            weight_scale = self.mm_param.get_fused_weight_part("weight_scale")
+            self.quant_method.load_weight_scale(weight_scale, self.mm_param)
         return
 
     def _load_weight_zero_point(
@@ -168,13 +166,13 @@ def _load_weight_zero_point(
     ) -> None:
         if param_name in weights:
             weight_zero_point = self.param_slicer._slice_weight_zero_point(weights[param_name])
-            start_idx = self.cusum_out_dims[sub_child_index]
-            self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param, start_idx)
-            self.mm_param.load_ok[sub_child_index][2] = True
+            self.mm_param.weight_zero_point_cpu_buffer[sub_child_index] = weight_zero_point
+            weight_zero_point = self.mm_param.get_fused_weight_part("weight_zero_point")
+            self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param)
         return
 
     def verify_load(self):
-        mm_param_load_ok = all(all(load_ok_list) for load_ok_list in self.mm_param.load_ok)
+        mm_param_load_ok = all(self.mm_param.load_ok)
         bias_load_ok = True if self.bias is None else all(self.bias._load_ok)
         if not (mm_param_load_ok and bias_load_ok):
             logger.warning(f"mm_param_load_ok: {self.mm_param.load_ok}, bias_load_ok: {self.bias}")
diff --git a/lightllm/common/quantization/awq.py b/lightllm/common/quantization/awq.py
index ddb7674dd..bbf759757 100644
--- a/lightllm/common/quantization/awq.py
+++ b/lightllm/common/quantization/awq.py
@@ -118,22 +118,7 @@ def create_weight(
         weight_zero_point = torch.empty(
             expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
         ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
-
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        start_idx = start_idx // self.pack_factor
-        weight_pack.weight[:, start_idx : start_idx + weight.shape[1]].copy_(weight)
-        return
-
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[:, start_idx : start_idx + weight_scale.shape[1]].copy_(weight_scale)
-        return
-
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        start_idx = start_idx // self.pack_factor
-        end_idx = start_idx + weight_zero_point.shape[1]
-        weight_pack.weight_zero_point[:, start_idx:end_idx].copy_(weight_zero_point)
-        return
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point, fused_dim=1)
 
 
 @QUANTMETHODS.register("awq_marlin", platform="cuda")
@@ -235,10 +220,12 @@ def create_weight(
         weight_zero_point = torch.empty(
             expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
         ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point, fused_dim=1)
 
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack) -> None:
         assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
+        if weight is None:
+            return
         device_id = get_current_device_id()
         repack_weight = vllm_ops.awq_marlin_repack(
             weight.cuda(device_id),
@@ -246,12 +233,13 @@ def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx:
             size_n=weight.shape[1] * self.pack_factor,
             num_bits=self.hf_quantization_config["bits"],
         )
-        start_idx = start_idx // self.pack_factor * self.tile_size
-        weight_pack.weight[:, start_idx : start_idx + repack_weight.shape[1]].copy_(repack_weight)
+        weight_pack.weight.copy_(repack_weight)
         return
 
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack) -> None:
         assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
+        if weight_scale is None:
+            return
         group_size = self.hf_quantization_config["group_size"]
         device_id = get_current_device_id()
         repack_weight_scale = marlin_permute_scales(
@@ -260,10 +248,12 @@ def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack,
             size_n=weight_scale.shape[1],
             group_size=self.hf_quantization_config["group_size"],
         )
-        weight_pack.weight_scale[:, start_idx : start_idx + repack_weight_scale.shape[1]].copy_(repack_weight_scale)
+        weight_pack.weight_scale.copy_(repack_weight_scale)
         return
 
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack) -> None:
+        if weight_zero_point is None:
+            return
         device_id = get_current_device_id()
         repack_weight_zero_point = awq_to_marlin_zero_points(
             weight_zero_point.cuda(device_id),
@@ -271,10 +261,7 @@ def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: W
             size_n=weight_zero_point.shape[1] * self.pack_factor,
             num_bits=self.hf_quantization_config["bits"],
         )
-        start_idx = start_idx // self.pack_factor
-        weight_pack.weight_zero_point[:, start_idx : start_idx + repack_weight_zero_point.shape[1]].copy_(
-            repack_weight_zero_point
-        )
+        weight_pack.weight_zero_point.copy_(repack_weight_zero_point)
         return
 
 
diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index 80be14c33..e24898169 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -107,14 +107,6 @@ def create_weight(
         weight_scale = torch.empty(expert_prefix + (scale_out_dim, scale_in_dim), dtype=torch.float32).cuda(device_id)
         return WeightPack(weight=weight, weight_scale=weight_scale)
 
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight[start_idx : start_idx + weight.shape[0]].copy_(weight)
-        return
-
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        weight_pack.weight_scale[start_idx // self.block_size : start_idx + weight_scale.shape[0]].copy_(weight_scale)
-        return
-
 
 def _deepgemm_fp8_nt(a_tuple, b_tuple, out):
     if HAS_DEEPGEMM:
diff --git a/lightllm/common/quantization/no_quant.py b/lightllm/common/quantization/no_quant.py
index c05c90b21..e68d9ffa7 100644
--- a/lightllm/common/quantization/no_quant.py
+++ b/lightllm/common/quantization/no_quant.py
@@ -52,5 +52,7 @@ def method_name(self):
         return "none"
 
     def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0) -> None:
-        weight_pack.weight[start_idx : start_idx + weight.shape[0], :].copy_(weight)
+        if weight is None:
+            return
+        weight_pack.weight.copy_(weight)
         return
diff --git a/lightllm/common/quantization/quantize_method.py b/lightllm/common/quantization/quantize_method.py
index 4350307f1..f685bd2d4 100644
--- a/lightllm/common/quantization/quantize_method.py
+++ b/lightllm/common/quantization/quantize_method.py
@@ -10,6 +10,7 @@ class WeightPack:
     weight: Optional[torch.Tensor] = None
     weight_scale: Optional[torch.Tensor] = None
     weight_zero_point: Optional[torch.Tensor] = None
+    fused_dim: Optional[int] = 0
 
     def get_expert(self, expert_idx: int):
         assert self.weight.ndim == 3, f"weight must be a 3D tensor, but got {self.weight.ndim}"
@@ -18,9 +19,29 @@ def get_expert(self, expert_idx: int):
         weight_zero_point = self.weight_zero_point[expert_idx] if self.weight_zero_point is not None else None
         return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
 
-    def initialize_load_status(self, weight_num: int):
-        initial_loaded_status = [False, self.weight_scale is None, self.weight_zero_point is None]
-        self.load_ok = [initial_loaded_status.copy() for _ in range(weight_num)]
+    def create_cpu_buffer(self, weight_num: int):
+        """Reset the per-part staging lists and the [weight, scale, zero_point] load flags."""
+        for part in ("weight", "weight_scale", "weight_zero_point"):
+            setattr(self, f"{part}_cpu_buffer", [None] * weight_num)
+        # A scale / zero-point tensor that is None on this pack starts out flagged as loaded.
+        self.load_ok = [False, self.weight_scale is None, self.weight_zero_point is None]
+
+    def get_fused_weight_part(self, weight_type) -> Optional[torch.Tensor]:
+        """Fuse the staged parts of ``weight_type`` once every slot is filled.
+
+        Returns None while a part is still missing; raises ValueError for an unknown ``weight_type``.
+        """
+        part_names = ("weight", "weight_scale", "weight_zero_point")  # position == index into self.load_ok
+        if weight_type not in part_names:
+            raise ValueError(f"unknown weight type: {weight_type}")
+        buffer_name, index = f"{weight_type}_cpu_buffer", part_names.index(weight_type)
+        cpu_buffer = getattr(self, buffer_name)
+        if None in cpu_buffer:
+            return None
+        fused = torch.cat(cpu_buffer, dim=self.fused_dim)
+        setattr(self, buffer_name, [None] * len(cpu_buffer))
+        self.load_ok[index] = True
+        return fused
 
 
 class QuantizationMethod(ABC):
@@ -44,7 +65,6 @@ def quantize(
         self,
         weight: torch.Tensor,
         output: WeightPack,
-        offset: int = 0,
     ) -> None:
         pass
 
@@ -74,17 +94,20 @@ def weight_need_quanted(self, weight: torch.Tensor) -> bool:
         # 判断一个 weight 是否需要进行量化操作。
         return weight.dtype in [torch.bfloat16, torch.float16, torch.float32, torch.float64]
 
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        raise NotImplementedError(
-            f"quantization method {self.method_name} is not supported to load offline quantized weight"
-        )
+    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack) -> None:
+        """Default loader: copy ``weight`` into ``weight_pack.weight``; a None ``weight`` is ignored."""
+        if weight is not None:
+            weight_pack.weight.copy_(weight)
+        return
 
-    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        raise NotImplementedError(
-            f"quantization method {self.method_name} is not supported to load offline quantized weight scale"
-        )
+    def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack) -> None:
+        """Default loader: copy ``weight_scale`` into ``weight_pack.weight_scale``; None is ignored."""
+        if weight_scale is not None:
+            weight_pack.weight_scale.copy_(weight_scale)
+        return
 
-    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack, start_idx: int) -> None:
-        raise NotImplementedError(
-            f"quantization method {self.method_name} is not supported to load offline quantized weight zero point"
-        )
+    def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack) -> None:
+        """Default loader: copy ``weight_zero_point`` into the pack's zero-point tensor; None is ignored."""
+        if weight_zero_point is not None:
+            weight_pack.weight_zero_point.copy_(weight_zero_point)
+        return
diff --git a/lightllm/common/quantization/w8a8.py b/lightllm/common/quantization/w8a8.py
index 0a74d9887..5bb7243d6 100644
--- a/lightllm/common/quantization/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -68,13 +68,13 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+    def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
         weight = weight.float().cuda(self.device_id_)
         scale = weight.abs().max(dim=-1)[0] / 127
         weight = weight / scale.reshape(-1, 1)
         weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
-        output.weight[offset : offset + weight.shape[0]].copy_(weight)
-        output.weight_scale[offset : offset + weight.shape[0]].copy_(scale)
+        output.weight.copy_(weight)
+        output.weight_scale.copy_(scale)
         return
 
     def apply(
@@ -122,17 +122,17 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+    def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
         if self.is_moe:
-            return self.quantize_moe(weight, output, offset)
+            return self.quantize_moe(weight, output)
         qweight, weight_scale = scaled_fp8_quant(
             weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
-        output.weight[offset : offset + qweight.shape[0], :].copy_(qweight)
-        output.weight_scale[offset : offset + weight_scale.shape[0]].copy_(weight_scale.view(-1))
+        output.weight.copy_(qweight)
+        output.weight_scale.copy_(weight_scale.view(-1))
         return
 
-    def quantize_moe(self, weight: torch.Tensor) -> WeightPack:
+    def quantize_moe(self, weight: torch.Tensor, output: WeightPack) -> WeightPack:
         num_experts = weight.shape[0]
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
         weight_scales = []
@@ -143,7 +143,9 @@ def quantize_moe(self, weight: torch.Tensor) -> WeightPack:
             qweights[i] = qweight
             weight_scales.append(weight_scale)
         weight_scale = torch.stack(weight_scales, dim=0).contiguous()
-        return WeightPack(weight=qweights, weight_scale=weight_scale)
+        output.weight.copy_(qweights)
+        output.weight_scale.copy_(weight_scale.view(-1))
+        return
 
     def apply(
         self,
@@ -192,13 +194,13 @@ def __init__(self):
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+    def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
         from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant
 
         device = output.weight.device
         weight, scale = weight_quant(weight.cuda(device), self.block_size)
-        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
-        output.weight_scale[offset // self.block_size : offset + weight.shape[0] // self.block_size].copy_(scale)
+        output.weight.copy_(weight)
+        output.weight_scale.copy_(scale)
         return
 
     def apply(

From 98802c679c4f09cd0a3f810ccc226b6909c55fc9 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Mon, 26 Jan 2026 10:55:43 +0000
Subject: [PATCH 37/65] fix att sink + gpt oss moe

---
 .../meta_weights/att_sink_weight.py           | 13 ++--
 .../fused_moe/gpt_oss_fused_moe_weight_tp.py  | 69 ++++++++++++-------
 .../layer_weights/transformer_layer_weight.py | 13 ++--
 lightllm/models/gpt_oss/model.py              | 17 +++--
 4 files changed, 67 insertions(+), 45 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
index f3952992c..2013d55be 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/att_sink_weight.py
@@ -5,21 +5,22 @@
 
 
 class TpAttSinkWeight(BaseWeightTpl):
-    def __init__(self, all_kv_head_num: int, head_dim: int, weight_name: str, data_type):
+    def __init__(self, all_q_head_num: int, weight_name: str, data_type):
         super().__init__()
-        self.all_kv_head_num = all_kv_head_num
-        self.head_dim = head_dim
+        self.all_q_head_num = all_q_head_num
         self.weight_name = weight_name
         self.data_type_ = data_type
-        self._start_head_index, self._end_head_index = self._get_head_tp_split_params(all_head_num=self.all_kv_head_num)
+        self._start_head_index, self._end_head_index = self._get_head_tp_split_params(all_head_num=self.all_q_head_num)
         self._create_weight()
 
     def _create_weight(self):
-        self.weight = torch.empty((self.all_kv_head_num, self.head_dim), dtype=self.data_type_, device="cuda")
+        self.weight = torch.empty(
+            (self._end_head_index - self._start_head_index,), dtype=self.data_type_, device="cuda"
+        )
         self.weight.load_ok = False
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
-        if self.weight_name not in weights or self.weight is not None:
+        if self.weight_name not in weights:
             return
 
         t_weight = weights[self.weight_name]
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index e7748b1df..129c787f9 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -6,6 +6,7 @@
 from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.fused_moe_weight import FusedMoeWeight
 from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id
 from lightllm.common.quantization import Quantcfg
+from lightllm.common.quantization.quantize_method import QuantizationMethod
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -33,38 +34,41 @@
 class GPTOSSFusedMoeWeightTP(FusedMoeWeight):
     def __init__(
         self,
-        gate_up_proj_name: str,  # diff with FusedMoeWeightTP
+        gate_up_proj_name: str,
         down_proj_name: str,
         e_score_correction_bias_name: str,
         weight_prefix: str,
         n_routed_experts: int,
-        num_fused_shared_experts: int,
-        split_inter_size: int,
+        hidden_size: int,
+        moe_intermediate_size: int,
         data_type: torch.dtype,
-        network_config: Dict[str, Any],
-        layer_num: int,
-        world_size: int = 1,  # diff with FusedMoeWeightTP
-        quant_cfg: Quantcfg = None,
+        quant_method: QuantizationMethod = None,
+        num_fused_shared_experts: int = 0,
+        layer_num: int = 0,
+        network_config: Dict[str, Any] = None,
     ) -> None:
+        network_config["norm_topk_prob"] = None
         super().__init__(
-            gate_up_proj_name,
-            down_proj_name,
-            gate_up_proj_name,
-            e_score_correction_bias_name,
-            weight_prefix,
-            n_routed_experts,
-            num_fused_shared_experts,
-            split_inter_size,
-            data_type,
-            network_config,
-            layer_num,
-            quant_cfg,
+            gate_proj_name=gate_up_proj_name,
+            down_proj_name=down_proj_name,
+            up_proj_name=gate_up_proj_name,
+            e_score_correction_bias_name=e_score_correction_bias_name,
+            weight_prefix=weight_prefix,
+            n_routed_experts=n_routed_experts,
+            hidden_size=hidden_size,
+            moe_intermediate_size=moe_intermediate_size,
+            data_type=data_type,
+            quant_method=quant_method,
+            num_fused_shared_experts=num_fused_shared_experts,
+            layer_num=layer_num,
+            network_config=network_config,
         )
+        del self.w13, self.w2
+
         self.hidden_size = network_config["hidden_size"]
 
         self.alpha = 1.702
         self.limit = 7.0
-        self.tp_world_size_ = world_size
 
         self.w1_bias = None
         self.w2_bias = None
@@ -116,22 +120,34 @@ def load_hf_weights(self, weights):
             w2_bias = weights[self._down_bias_name]
             self.w2_bias = self._cuda(w2_bias)
 
-    def router(self, router_logits, top_k):
+    def _router(self, router_logits, top_k):
         router_top_value, router_indices = torch.topk(router_logits, top_k, dim=-1)
         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
         return router_top_value, router_indices
 
-    def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
-        topk_weights, topk_ids = self.router(router_logits, top_k)
+    def experts(
+        self,
+        input_tensor: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: int,
+        num_expert_group: int,
+        is_prefill: Optional[bool] = None,
+    ):
+
+        topk_weights, topk_ids = self._router(router_logits, top_k)
 
         w1, w1_scale = self.w1
         w2, w2_scale = self.w2
         use_fp8_w8a8 = self.quant_method is not None
+        use_fp8_w8a8 = False  # TODO: disable fp8 for GPT-OSS for now
 
-        from lightllm.common.fused_moe.grouped_fused_moe import fused_experts
+        from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe import fused_experts
 
         output_tensor = fused_experts(
-            hidden_states=input_tensor.to(torch.bfloat16),
+            hidden_states=input_tensor.to(w1.dtype),
             w1=w1,
             w2=w2,
             topk_weights=topk_weights,
@@ -201,3 +217,6 @@ def _convert_moe_packed_tensors(
         out = out.reshape(*prefix_shape, G, B * 2).view(*prefix_shape, G * B * 2)
         del blocks, scales, lut
         return out.transpose(1, 2).contiguous()
+
+    def _cuda(self, cpu_tensor):  # make contiguous -> cast to self.data_type_ -> move to get_current_device_id()
+        return cpu_tensor.contiguous().to(self.data_type_).cuda(get_current_device_id())
diff --git a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
index abf2b07e9..7c8c30940 100644
--- a/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/gpt_oss/layer_weights/transformer_layer_weight.py
@@ -48,13 +48,13 @@ def _init_moe(self):
             e_score_correction_bias_name="",
             weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts",
             n_routed_experts=n_routed_experts,
-            split_inter_size=moe_intermediate_size // self.tp_world_size_,
+            hidden_size=self.n_embed,
+            moe_intermediate_size=moe_intermediate_size,
             data_type=self.data_type_,
-            network_config=self.network_config_,
-            layer_num=self.layer_num_,
-            world_size=self.tp_world_size_,  # diff with FusedMoeWeightTP
-            quant_cfg=self.quant_cfg,
+            quant_method=self.quant_cfg.get_quant_method(self.layer_num_, "fused_moe"),
             num_fused_shared_experts=0,
+            layer_num=self.layer_num_,
+            network_config=self.network_config_,
         )
 
     def _init_weight_names(self):
@@ -72,8 +72,7 @@ def _init_weight(self):
         super()._init_weight()
 
         self.attn_sinks = TpAttSinkWeight(
-            all_kv_head_num=self.q_head_num_ + self.k_head_num_,
-            head_dim=self.head_dim,
+            all_q_head_num=self.q_head_num_,
             weight_name=f"model.layers.{self.layer_num_}.self_attn.sinks",
             data_type=torch.bfloat16,
         )
diff --git a/lightllm/models/gpt_oss/model.py b/lightllm/models/gpt_oss/model.py
index dc5f2abdf..9e9561eb2 100644
--- a/lightllm/models/gpt_oss/model.py
+++ b/lightllm/models/gpt_oss/model.py
@@ -2,9 +2,10 @@
 from lightllm.models.gpt_oss.layer_weights.transformer_layer_weight import GptOssTransformerLayerWeight
 from lightllm.models.llama.model import LlamaTpPartModel
 from lightllm.models.registry import ModelRegistry
-
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.utils.log_utils import init_logger
+from lightllm.common.basemodel.attention import get_prefill_att_backend_class, get_decode_att_backend_class
+from lightllm.common.basemodel.attention import BaseAttBackend
 
 logger = init_logger(__name__)
 
@@ -19,9 +20,11 @@ class GptOssTpPartModel(LlamaTpPartModel):
 
     def __init__(self, kvargs):
         super().__init__(kvargs)
-        assert (
-            get_env_start_args().llm_prefill_att_backend[0] == "fa3"
-        ), "For now GPT-OSS type model only support flashattention-3"
-        assert (
-            get_env_start_args().llm_decode_att_backend[0] == "fa3"
-        ), "For now GPT-OSS type model only support flashattention-3"
+
+    def _init_att_backend(self):
+        """Create the prefill and decode attention backends, each chosen from the ["fa3"] priority list."""
+        prefill_backend_cls = get_prefill_att_backend_class(index=0, priority_list=["fa3"])
+        self.prefill_att_backend: BaseAttBackend = prefill_backend_cls(model=self)
+        # The decode backend class is looked up only after the prefill backend has been constructed.
+        decode_backend_cls = get_decode_att_backend_class(index=0, priority_list=["fa3"])
+        self.decode_att_backend: BaseAttBackend = decode_backend_cls(model=self)

From 84486f307058edeff7da56ea869a882ec4471d1b Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 11:42:54 +0000
Subject: [PATCH 38/65] simplify fuse_moe

---
 .../fused_moe/fused_moe_weight.py             | 117 +++++++++++-------
 .../common/quantization/quantize_method.py    |  14 ++-
 2 files changed, 81 insertions(+), 50 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index a492d82ef..95f793052 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -310,71 +310,92 @@ def _load_weight(self, expert_idx_to_local_idx: Dict[int, int], weights: Dict[st
         # Load each expert with TP slicing
         for expert_idx, local_expert_idx in expert_idx_to_local_idx.items():
             with self.lock:
-                self._load_expert(
-                    expert_idx, local_expert_idx, weights, type="weight", suffix=self.quant_method.weight_suffix
+                self._load_expert(expert_idx, local_expert_idx, weights)
+                self._load_expert_scale(
+                    expert_idx,
+                    local_expert_idx,
+                    weights,
+                )
+                self._load_expert_zero_point(
+                    expert_idx,
+                    local_expert_idx,
+                    weights,
                 )
-            if self.w13.weight_scale is not None:
-                with self.lock:
-                    self._load_expert(
-                        expert_idx,
-                        local_expert_idx,
-                        weights,
-                        type="weight_scale",
-                        suffix=self.quant_method.weight_scale_suffix,
-                    )
-            if self.w13.weight_zero_point is not None:
-                with self.lock:
-                    self._load_expert(
-                        expert_idx,
-                        local_expert_idx,
-                        weights,
-                        type="weight_zero_point",
-                        suffix=self.quant_method.weight_zero_point_suffix,
-                    )
 
     def _load_expert(
         self,
         expert_idx: int,
         local_expert_idx: int,
         weights: Dict[str, torch.Tensor],
-        type: str,
-        suffix: str = "weight",
     ):
-        w1_weight = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{suffix}"
-        w2_weight = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{suffix}"
-        w3_weight = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{suffix}"
-        load_func = self._get_load_func(type)
-        row_slice_func = self._get_slice_func(self.row_slicer, type)
-        col_slice_func = self._get_slice_func(self.col_slicer, type)
+        w1_weight = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{self.quant_method.weight_suffix}"
+        w2_weight = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{self.quant_method.weight_suffix}"
+        w3_weight = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{self.quant_method.weight_suffix}"
+        row_slice_func = self.row_slicer._slice_weight
+        col_slice_func = self.col_slicer._slice_weight
         if w1_weight in weights:
             self.w13_list[local_expert_idx].weight_cpu_buffer[0] = row_slice_func(weights[w1_weight])
         if w3_weight in weights:
             self.w13_list[local_expert_idx].weight_cpu_buffer[1] = row_slice_func(weights[w3_weight])
-        w13_weight = self.w13_list[local_expert_idx].get_fused_weight_part(suffix)
-        load_func(w13_weight, self.w13_list[local_expert_idx])
+        w13_weight = self.w13_list[local_expert_idx].get_fused_weight_part("weight")
+        self._load_weight_func(w13_weight, self.w13_list[local_expert_idx])
         if w2_weight in weights:
             self.w2_list[local_expert_idx].weight_cpu_buffer[0] = col_slice_func(weights[w2_weight])
-        w2_weight = self.w2_list[local_expert_idx].get_fused_weight_part(suffix)
-        load_func(w2_weight, self.w2_list[local_expert_idx])
+            w2_weight = self.w2_list[local_expert_idx].get_fused_weight_part("weight")
+            self._load_weight_func(w2_weight, self.w2_list[local_expert_idx])
+
+    def _load_expert_scale(
+        self,
+        expert_idx: int,
+        local_expert_idx: int,
+        weights: Dict[str, torch.Tensor],
+    ):
+        w1_scale = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{self.quant_method.weight_scale_suffix}"
+        w2_scale = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{self.quant_method.weight_scale_suffix}"
+        w3_scale = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{self.quant_method.weight_scale_suffix}"
+        row_slice_func = self.row_slicer._slice_weight_scale
+        col_slice_func = self.col_slicer._slice_weight_scale
+        if w1_scale in weights:
+            self.w13_list[local_expert_idx].weight_scale_cpu_buffer[0] = row_slice_func(weights[w1_scale])
+        if w3_scale in weights:
+            self.w13_list[local_expert_idx].weight_scale_cpu_buffer[1] = row_slice_func(weights[w3_scale])
+        w13_scale = self.w13_list[local_expert_idx].get_fused_weight_part("weight_scale")
+        self.quant_method.load_weight_scale(w13_scale, self.w13_list[local_expert_idx])
+        if w2_scale in weights:
+            self.w2_list[local_expert_idx].weight_scale_cpu_buffer[0] = col_slice_func(weights[w2_scale])
+        w2_scale = self.w2_list[local_expert_idx].get_fused_weight_part("weight_scale")
+        self.quant_method.load_weight_scale(w2_scale, self.w2_list[local_expert_idx])
+
+    def _load_expert_zero_point(
+        self,
+        expert_idx: int,
+        local_expert_idx: int,
+        weights: Dict[str, torch.Tensor],
+    ):
+        w1_zero_point = (
+            f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{self.quant_method.weight_zero_point_suffix}"
+        )
+        w2_zero_point = (
+            f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{self.quant_method.weight_zero_point_suffix}"
+        )
+        w3_zero_point = (
+            f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{self.quant_method.weight_zero_point_suffix}"
+        )
+        row_slice_func = self.row_slicer._slice_weight_zero_point
+        col_slice_func = self.col_slicer._slice_weight_zero_point
+        if w1_zero_point in weights:
+            self.w13_list[local_expert_idx].weight_zero_point_cpu_buffer[0] = row_slice_func(weights[w1_zero_point])
+        if w3_zero_point in weights:
+            self.w13_list[local_expert_idx].weight_zero_point_cpu_buffer[1] = row_slice_func(weights[w3_zero_point])
+        w13_zero_point = self.w13_list[local_expert_idx].get_fused_weight_part("weight_zero_point")
+        self.quant_method.load_weight_zero_point(w13_zero_point, self.w13_list[local_expert_idx])
+        if w2_zero_point in weights:
+            self.w2_list[local_expert_idx].weight_zero_point_cpu_buffer[0] = col_slice_func(weights[w2_zero_point])
+        w2_zero_point = self.w2_list[local_expert_idx].get_fused_weight_part("weight_zero_point")
+        self.quant_method.load_weight_zero_point(w2_zero_point, self.w2_list[local_expert_idx])
 
     def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack):
         if self.quant_method.weight_need_quanted(weight):
             self.quant_method.quantize(weight, weight_pack)
         else:
             self.quant_method.load_weight(weight, weight_pack)
-
-    def _get_load_func(self, type: str):
-        if type == "weight":
-            return self._load_weight_func
-        elif type == "weight_scale":
-            return getattr(self.quant_method, "load_weight_scale")
-        elif type == "weight_zero_point":
-            return getattr(self.quant_method, "load_weight_zero_point")
-
-    def _get_slice_func(self, slicer: SliceMixinTpl, type: str):
-        if type == "weight":
-            return slicer._slice_weight
-        elif type == "weight_scale":
-            return slicer._slice_weight_scale
-        elif type == "weight_zero_point":
-            return slicer._slice_weight_zero_point
diff --git a/lightllm/common/quantization/quantize_method.py b/lightllm/common/quantization/quantize_method.py
index f685bd2d4..296173e5a 100644
--- a/lightllm/common/quantization/quantize_method.py
+++ b/lightllm/common/quantization/quantize_method.py
@@ -17,7 +17,9 @@ def get_expert(self, expert_idx: int):
         weight = self.weight[expert_idx]
         weight_scale = self.weight_scale[expert_idx] if self.weight_scale is not None else None
         weight_zero_point = self.weight_zero_point[expert_idx] if self.weight_zero_point is not None else None
-        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+        return WeightPack(
+            weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point, fused_dim=self.fused_dim
+        )
 
     def create_cpu_buffer(self, weight_num: int):
         self.weight_cpu_buffer = [None] * weight_num
@@ -37,7 +39,13 @@ def get_fused_weight_part(self, weight_type) -> Optional[torch.Tensor]:
             raise ValueError(f"unknown weight type: {weight_type}")
         cpu_buffer = getattr(self, buffer_name)
         if None not in cpu_buffer:
-            fused = torch.cat(cpu_buffer, dim=self.fused_dim)
+            try:
+                fused = torch.cat(cpu_buffer, dim=self.fused_dim)
+            except Exception as e:
+                print(len(cpu_buffer), self.fused_dim)
+                for buff in cpu_buffer:
+                    print(buff.shape)
+                raise e
             setattr(self, buffer_name, [None] * len(cpu_buffer))
             self.load_ok[index] = True
             return fused
@@ -91,6 +99,8 @@ def create_weight(
         pass
 
     def weight_need_quanted(self, weight: torch.Tensor) -> bool:
+        if weight is None:
+            return False
         # 判断一个 weight 是否需要进行量化操作。
         return weight.dtype in [torch.bfloat16, torch.float16, torch.float32, torch.float64]
 

From ee9dc78e83c2d531b828f190358a2240b1fdc406 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 13:58:19 +0000
Subject: [PATCH 39/65] remove weight cpu buffer and add weight_list

---
 .../fused_moe/fused_moe_weight.py             | 47 ++++-----
 .../fused_moe/gpt_oss_fused_moe_weight_tp.py  |  4 +-
 .../meta_weights/mm_weight/mm_weight.py       | 42 ++++----
 lightllm/common/quantization/awq.py           | 39 +++-----
 lightllm/common/quantization/deepgemm.py      |  7 +-
 lightllm/common/quantization/no_quant.py      | 14 ++-
 .../common/quantization/quantize_method.py    | 97 +++++++++++--------
 lightllm/common/quantization/w8a8.py          | 60 +++++-------
 8 files changed, 151 insertions(+), 159 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index 95f793052..e945be587 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -279,29 +279,28 @@ def _create_weight(self):
                 device=f"cuda:{self.device_id_}",
             )
 
-        self.w13: WeightPack = self.quant_method.create_weight(
-            out_dim=intermediate_size * 2,
+        self.w13, w13_param_list = self.quant_method.create_moe_weight(
+            out_dims=[intermediate_size, intermediate_size],
             in_dim=self.hidden_size,
             dtype=self.data_type_,
             device_id=self.device_id_,
             num_experts=self.local_n_routed_experts,
         )
-        self.w13_list: List[WeightPack] = self._get_expert_weight_list(self.w13, 2)
-        self.w2: WeightPack = self.quant_method.create_weight(
-            out_dim=self.hidden_size,
+        self.w2, _ = self.quant_method.create_moe_weight(
+            out_dims=[self.hidden_size],
             in_dim=intermediate_size,
             dtype=self.data_type_,
             device_id=self.device_id_,
             num_experts=self.local_n_routed_experts,
         )
-        self.w2_list: List[WeightPack] = self._get_expert_weight_list(self.w2, 1)
-        self.load_cnt = 0
+        self.w1_list: List[WeightPack] = self._get_expert_weight_list(w13_param_list[0])
+        self.w3_list: List[WeightPack] = self._get_expert_weight_list(w13_param_list[1])
+        self.w2_list: List[WeightPack] = self._get_expert_weight_list(self.w2)
 
-    def _get_expert_weight_list(self, weight_pack: WeightPack, weight_num: int = 1):
+    def _get_expert_weight_list(self, weight_pack: WeightPack):
         weight_list = []
         for idx in range(self.local_n_routed_experts):
             expert_weight = weight_pack.get_expert(idx)
-            expert_weight.create_cpu_buffer(weight_num)
             weight_list.append(expert_weight)
         return weight_list
 
@@ -334,15 +333,11 @@ def _load_expert(
         row_slice_func = self.row_slicer._slice_weight
         col_slice_func = self.col_slicer._slice_weight
         if w1_weight in weights:
-            self.w13_list[local_expert_idx].weight_cpu_buffer[0] = row_slice_func(weights[w1_weight])
+            self._load_weight_func(row_slice_func(weights[w1_weight]), self.w1_list[local_expert_idx])
         if w3_weight in weights:
-            self.w13_list[local_expert_idx].weight_cpu_buffer[1] = row_slice_func(weights[w3_weight])
-        w13_weight = self.w13_list[local_expert_idx].get_fused_weight_part("weight")
-        self._load_weight_func(w13_weight, self.w13_list[local_expert_idx])
+            self._load_weight_func(row_slice_func(weights[w3_weight]), self.w3_list[local_expert_idx])
         if w2_weight in weights:
-            self.w2_list[local_expert_idx].weight_cpu_buffer[0] = col_slice_func(weights[w2_weight])
-            w2_weight = self.w2_list[local_expert_idx].get_fused_weight_part("weight")
-            self._load_weight_func(w2_weight, self.w2_list[local_expert_idx])
+            self._load_weight_func(col_slice_func(weights[w2_weight]), self.w2_list[local_expert_idx])
 
     def _load_expert_scale(
         self,
@@ -356,15 +351,11 @@ def _load_expert_scale(
         row_slice_func = self.row_slicer._slice_weight_scale
         col_slice_func = self.col_slicer._slice_weight_scale
         if w1_scale in weights:
-            self.w13_list[local_expert_idx].weight_scale_cpu_buffer[0] = row_slice_func(weights[w1_scale])
+            self._load_weight_scale_func(row_slice_func(weights[w1_scale]), self.w1_list[local_expert_idx])
         if w3_scale in weights:
-            self.w13_list[local_expert_idx].weight_scale_cpu_buffer[1] = row_slice_func(weights[w3_scale])
-        w13_scale = self.w13_list[local_expert_idx].get_fused_weight_part("weight_scale")
-        self.quant_method.load_weight_scale(w13_scale, self.w13_list[local_expert_idx])
+            self._load_weight_scale_func(row_slice_func(weights[w3_scale]), self.w3_list[local_expert_idx])
         if w2_scale in weights:
-            self.w2_list[local_expert_idx].weight_scale_cpu_buffer[0] = col_slice_func(weights[w2_scale])
-        w2_scale = self.w2_list[local_expert_idx].get_fused_weight_part("weight_scale")
-        self.quant_method.load_weight_scale(w2_scale, self.w2_list[local_expert_idx])
+            self._load_weight_scale_func(col_slice_func(weights[w2_scale]), self.w2_list[local_expert_idx])
 
     def _load_expert_zero_point(
         self,
@@ -384,15 +375,11 @@ def _load_expert_zero_point(
         row_slice_func = self.row_slicer._slice_weight_zero_point
         col_slice_func = self.col_slicer._slice_weight_zero_point
         if w1_zero_point in weights:
-            self.w13_list[local_expert_idx].weight_zero_point_cpu_buffer[0] = row_slice_func(weights[w1_zero_point])
+            self._load_weight_zero_point_func(row_slice_func(weights[w1_zero_point]), self.w1_list[local_expert_idx])
         if w3_zero_point in weights:
-            self.w13_list[local_expert_idx].weight_zero_point_cpu_buffer[1] = row_slice_func(weights[w3_zero_point])
-        w13_zero_point = self.w13_list[local_expert_idx].get_fused_weight_part("weight_zero_point")
-        self.quant_method.load_weight_zero_point(w13_zero_point, self.w13_list[local_expert_idx])
+            self._load_weight_zero_point_func(row_slice_func(weights[w3_zero_point]), self.w3_list[local_expert_idx])
         if w2_zero_point in weights:
-            self.w2_list[local_expert_idx].weight_zero_point_cpu_buffer[0] = col_slice_func(weights[w2_zero_point])
-        w2_zero_point = self.w2_list[local_expert_idx].get_fused_weight_part("weight_zero_point")
-        self.quant_method.load_weight_zero_point(w2_zero_point, self.w2_list[local_expert_idx])
+            self._load_weight_zero_point_func(col_slice_func(weights[w2_zero_point]), self.w2_list[local_expert_idx])
 
     def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack):
         if self.quant_method.weight_need_quanted(weight):
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index 129c787f9..666419f9c 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -63,7 +63,6 @@ def __init__(
             layer_num=layer_num,
             network_config=network_config,
         )
-        del self.w13, self.w2
 
         self.hidden_size = network_config["hidden_size"]
 
@@ -81,6 +80,9 @@ def __init__(
         self._gate_up_scales_name = f"{weight_prefix}.{gate_up_proj_name}_scales"
         return
 
+    def _create_weight(self):
+        pass
+
     def _fuse_weight_scale(self):
         assert False, "Not implemented for GPT-OSS."
 
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index de3b36669..912718925 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -34,9 +34,6 @@ def __init__(
         if isinstance(out_dims, int):
             out_dims = [out_dims]
         self.out_dims = out_dims
-        self.cusum_out_dims = [0]
-        for out_dim in out_dims[:-1]:
-            self.cusum_out_dims.append(self.cusum_out_dims[-1] + out_dim)
 
         if isinstance(weight_names, str):
             weight_names = [weight_names]
@@ -117,13 +114,14 @@ def _create_weight(self):
         self.bias = None
         if self.bias_names is not None:
             self.bias = torch.empty(sum(self.out_dims), dtype=self.data_type_).cuda(get_current_device_id())
+            # bias_list shares storage with bias for each output shard
+            self.bias_list = torch.split(self.bias, self.out_dims, dim=0)
             self.bias._load_ok = [False] * len(self.bias_names)
-        self.mm_param: WeightPack = self.quant_method.create_weight(
-            in_dim=self.in_dim, out_dim=sum(self.out_dims), dtype=self.data_type_, device_id=get_current_device_id()
+        self.mm_param: WeightPack = None
+        self.mm_param_list: List[WeightPack] = None
+        self.mm_param, self.mm_param_list = self.quant_method.create_weight(
+            in_dim=self.in_dim, out_dims=self.out_dims, dtype=self.data_type_, device_id=get_current_device_id()
         )
-        # For fused weights such as gate_up_proj, we first load them into a CPU buffer
-        # for online quantization (e.g., per-tensor quantization).
-        self.mm_param.create_cpu_buffer(len(self.weight_names))
         return
 
     # 执行顺序
@@ -132,12 +130,14 @@ def _load_weight(
     ) -> None:
         if param_name in weights:
             weight = self.param_slicer._slice_weight(weights[param_name])
-            self.mm_param.weight_cpu_buffer[sub_child_index] = weight
-            weight = self.mm_param.get_fused_weight_part("weight")
             if self.quant_method.weight_need_quanted(weight):
-                self.quant_method.quantize(weight, self.mm_param)
+                self.quant_method.quantize(weight, self.mm_param_list[sub_child_index])
+                # online quantization, so we need to set the load_ok for weight_scale and weight_zero_point.
+                self.mm_param_list[sub_child_index].load_ok[1] = True
+                self.mm_param_list[sub_child_index].load_ok[2] = True
             else:
-                self.quant_method.load_weight(weight, self.mm_param)
+                self.quant_method.load_weight(weight, self.mm_param_list[sub_child_index])
+            self.mm_param_list[sub_child_index].load_ok[0] = True
         return
 
     def _load_bias(
@@ -145,10 +145,8 @@ def _load_bias(
     ) -> None:
         if param_name in weights:
             bias = self.param_slicer._slice_bias(weights[param_name])
-            start_idx = self.cusum_out_dims[sub_child_index]
-            end_idx = start_idx + bias.shape[0]
-            self.bias[start_idx:end_idx].copy_(bias)
-            self.bias._load_ok[sub_child_index] = True
+            self.bias_list[sub_child_index].copy_(bias)
+            self.bias._load_ok[sub_child_index] = True
         return
 
     def _load_weight_scale(
@@ -156,9 +154,7 @@ def _load_weight_scale(
     ) -> None:
         if param_name in weights:
             weight_scale = self.param_slicer._slice_weight_scale(weights[param_name])
-            self.mm_param.weight_scale_cpu_buffer[sub_child_index] = weight_scale
-            weight_scale = self.mm_param.get_fused_weight_part("weight_scale")
-            self.quant_method.load_weight_scale(weight_scale, self.mm_param)
+            self.quant_method.load_weight_scale(weight_scale, self.mm_param_list[sub_child_index])
         return
 
     def _load_weight_zero_point(
@@ -166,16 +162,14 @@ def _load_weight_zero_point(
     ) -> None:
         if param_name in weights:
             weight_zero_point = self.param_slicer._slice_weight_zero_point(weights[param_name])
-            self.mm_param.weight_zero_point_cpu_buffer[sub_child_index] = weight_zero_point
-            weight_zero_point = self.mm_param.get_fused_weight_part("weight_zero_point")
-            self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param)
+            self.quant_method.load_weight_zero_point(weight_zero_point, self.mm_param_list[sub_child_index])
         return
 
     def verify_load(self):
-        mm_param_load_ok = all(self.mm_param.load_ok)
+        mm_param_load_ok = all(all(_mm_param.load_ok) for _mm_param in self.mm_param_list)
         bias_load_ok = True if self.bias is None else all(self.bias._load_ok)
         if not (mm_param_load_ok and bias_load_ok):
-            logger.warning(f"mm_param_load_ok: {self.mm_param.load_ok}, bias_load_ok: {self.bias}")
+            logger.warning(f"mm_param_load_ok: {self.mm_param_list[0].load_ok}")
         return mm_param_load_ok and bias_load_ok
 
     def _get_tp_dim(self, dim: int) -> int:
diff --git a/lightllm/common/quantization/awq.py b/lightllm/common/quantization/awq.py
index bbf759757..e5d0b4519 100644
--- a/lightllm/common/quantization/awq.py
+++ b/lightllm/common/quantization/awq.py
@@ -1,5 +1,5 @@
 import torch
-from typing import Any, Optional, Tuple
+from typing import Any, Optional, Tuple, List
 
 from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.quantization.registry import QUANTMETHODS
@@ -108,9 +108,10 @@ def apply(
             out.add_(bias)
         return out
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    def _create_weight(
+        self, out_dims: List[int], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        out_dim = sum(out_dims)
         group_size = self.hf_quantization_config["group_size"]
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (in_dim, out_dim // self.pack_factor), dtype=torch.int32).cuda(device_id)
@@ -118,7 +119,9 @@ def create_weight(
         weight_zero_point = torch.empty(
             expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
         ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point, fused_dim=1)
+        mm_param = WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-1, weight_scale_split_dim=-1)
+        return mm_param, mm_param_list
 
 
 @QUANTMETHODS.register("awq_marlin", platform="cuda")
@@ -145,21 +148,6 @@ def method_name(self):
     def quantize(self, weight: torch.Tensor, offset: int = 0) -> WeightPack:
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
-    def params_repack(
-        self, weight: torch.Tensor, weight_scale: torch.Tensor, weight_zero_point: torch.Tensor, dtype_type: torch.dtype
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        一些量化方法在将参数完成量化后，为了加速性能，还需要将参数进行重拍，使算子性能达到最优，如awq方法。
-        """
-        weight = self._process_weight_after_loading(weight.cuda(get_current_device_id()))
-        weight_scale = self._process_weight_scale_after_loading(
-            weight_scale.cuda(get_current_device_id()).to(dtype_type)
-        )
-        weight_zero_point = self._process_weight_zero_point_after_loading(
-            weight_zero_point.cuda(get_current_device_id())
-        )
-        return weight, weight_scale, weight_zero_point
-
     def apply(
         self,
         input_tensor: torch.Tensor,
@@ -206,9 +194,10 @@ def apply(
             out.add_(bias)
         return out
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    def _create_weight(
+        self, out_dims: List[int], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        out_dim = sum(out_dims)
         self.n = out_dim
         self.k = in_dim
         group_size = self.hf_quantization_config["group_size"]
@@ -220,7 +209,9 @@ def create_weight(
         weight_zero_point = torch.empty(
             expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
         ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point, fused_dim=1)
+        mm_param = WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-1, weight_scale_split_dim=-1)
+        return mm_param, mm_param_list
 
     def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack) -> None:
         assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index e24898169..df6195a21 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -1,5 +1,5 @@
 import torch
-from typing import Optional
+from typing import Optional, List, Union
 
 from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.quantization.registry import QUANTMETHODS
@@ -97,9 +97,10 @@ def apply(
         _deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight, weight_scale), out)
         return out
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    def _create_weight(
+        self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
     ) -> WeightPack:
+        out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
         scale_out_dim = (out_dim + self.block_size - 1) // self.block_size
diff --git a/lightllm/common/quantization/no_quant.py b/lightllm/common/quantization/no_quant.py
index e68d9ffa7..35ce07c53 100644
--- a/lightllm/common/quantization/no_quant.py
+++ b/lightllm/common/quantization/no_quant.py
@@ -1,5 +1,5 @@
 import torch
-from typing import Optional
+from typing import Optional, List, Union, Tuple
 
 from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.quantization.registry import QUANTMETHODS
@@ -34,12 +34,16 @@ def apply(
             return torch.mm(input_tensor, weight, out=out)
         return torch.addmm(bias, input_tensor, weight, out=out)
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    def _create_weight(
+        self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=dtype).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=None, weight_zero_point=None)
+        mm_param = WeightPack(weight=weight, weight_scale=None, weight_zero_point=None)
+        # weight layout is (out_dim, in_dim), so the split dimension is -2.
+        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2)
+        return mm_param, mm_param_list
 
     def weight_need_quanted(self, weight: torch.Tensor) -> bool:
         return False
diff --git a/lightllm/common/quantization/quantize_method.py b/lightllm/common/quantization/quantize_method.py
index 296173e5a..b596780c3 100644
--- a/lightllm/common/quantization/quantize_method.py
+++ b/lightllm/common/quantization/quantize_method.py
@@ -2,7 +2,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from lightllm.utils.dist_utils import get_current_device_id
-from typing import Optional, Tuple
+from typing import Optional, List, Tuple
 
 
 @dataclass
@@ -10,46 +10,16 @@ class WeightPack:
     weight: Optional[torch.Tensor] = None
     weight_scale: Optional[torch.Tensor] = None
     weight_zero_point: Optional[torch.Tensor] = None
-    fused_dim: Optional[int] = 0
+
+    def __post_init__(self):
+        self.load_ok = [False, self.weight_scale is None, self.weight_zero_point is None]
 
     def get_expert(self, expert_idx: int):
         assert self.weight.ndim == 3, f"weight must be a 3D tensor, but got {self.weight.ndim}"
         weight = self.weight[expert_idx]
         weight_scale = self.weight_scale[expert_idx] if self.weight_scale is not None else None
         weight_zero_point = self.weight_zero_point[expert_idx] if self.weight_zero_point is not None else None
-        return WeightPack(
-            weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point, fused_dim=self.fused_dim
-        )
-
-    def create_cpu_buffer(self, weight_num: int):
-        self.weight_cpu_buffer = [None] * weight_num
-        self.weight_scale_cpu_buffer = [None] * weight_num
-        self.weight_zero_point_cpu_buffer = [None] * weight_num
-        self.load_ok = [False, self.weight_scale is None, self.weight_zero_point is None]
-        return
-
-    def get_fused_weight_part(self, weight_type) -> Optional[torch.Tensor]:
-        buffer_map = {
-            "weight": ("weight_cpu_buffer", 0),
-            "weight_scale": ("weight_scale_cpu_buffer", 1),
-            "weight_zero_point": ("weight_zero_point_cpu_buffer", 2),
-        }
-        buffer_name, index = buffer_map.get(weight_type)
-        if buffer_name is None:
-            raise ValueError(f"unknown weight type: {weight_type}")
-        cpu_buffer = getattr(self, buffer_name)
-        if None not in cpu_buffer:
-            try:
-                fused = torch.cat(cpu_buffer, dim=self.fused_dim)
-            except Exception as e:
-                print(len(cpu_buffer), self.fused_dim)
-                for buff in cpu_buffer:
-                    print(buff.shape)
-                raise e
-            setattr(self, buffer_name, [None] * len(cpu_buffer))
-            self.load_ok[index] = True
-            return fused
-        return None
+        return WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
 
 
 class QuantizationMethod(ABC):
@@ -64,6 +34,7 @@ def __init__(self):
         self.has_weight_zero_point: bool = None
         self.group_size: int = -1  # -1表示不分组即per-channel量化，其他表示分组大小
         self.pack_factor: int = 1
+        self.block_size: int = 1
 
         # 一些量化模式需要用到的额外量化参数，如awq量化
         self.hf_quantization_config = None
@@ -94,9 +65,25 @@ def method_name(self):
         pass
 
     def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
-        pass
+        self, out_dims: List[int], in_dim: int, dtype: torch.dtype, device_id: int
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        return self._create_weight(
+            out_dims=out_dims,
+            in_dim=in_dim,
+            dtype=dtype,
+            device_id=device_id,
+        )
+
+    def create_moe_weight(
+        self, out_dims: List[int], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        return self._create_weight(
+            out_dims=out_dims,
+            in_dim=in_dim,
+            dtype=dtype,
+            device_id=device_id,
+            num_experts=num_experts,
+        )
 
     def weight_need_quanted(self, weight: torch.Tensor) -> bool:
         if weight is None:
@@ -121,3 +108,37 @@ def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: W
             return
         weight_pack.weight_zero_point.copy_(weight_zero_point)
         return
+
+    def _create_weight(
+        self, out_dims: List[int], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        pass
+
+    def _split_weight_pack(
+        self,
+        weight_pack: WeightPack,
+        out_dims: List[int],
+        weight_split_dim: Optional[int],
+        weight_scale_split_dim: Optional[int] = None,
+    ) -> List[WeightPack]:
+        # only support per-channel or block-wise quantization for now.
+        mm_param_list: List[WeightPack] = []
+        packed_out_dims = [dim // self.pack_factor for dim in out_dims]
+        scale_out_dims = [dim // self.block_size for dim in out_dims]
+        weight = torch.split(weight_pack.weight, packed_out_dims, dim=weight_split_dim)
+        weight_scale = (
+            [None] * len(out_dims)
+            if weight_pack.weight_scale is None
+            else (torch.split(weight_pack.weight_scale, scale_out_dims, dim=weight_scale_split_dim))
+        )
+        # the ndim of weight_zero_point is the same as weight_scale.
+        weight_zero_point = (
+            [None] * len(out_dims)
+            if weight_pack.weight_zero_point is None
+            else (torch.split(weight_pack.weight_zero_point, packed_out_dims, dim=weight_scale_split_dim))
+        )
+        for weight, weight_scale, weight_zero_point in zip(weight, weight_scale, weight_zero_point):
+            mm_param_list.append(
+                WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
+            )
+        return mm_param_list
diff --git a/lightllm/common/quantization/w8a8.py b/lightllm/common/quantization/w8a8.py
index 5bb7243d6..9fbd9b570 100644
--- a/lightllm/common/quantization/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -1,7 +1,7 @@
 import os
 import torch
 import torch.nn.functional as F
-from typing import Optional
+from typing import Optional, List, Union, Tuple
 from .quantize_method import QuantizationMethod
 from .registry import QUANTMETHODS
 from lightllm.common.basemodel.triton_kernel.quantization.scaled_mm_per_token_kernel import fp8_scaled_mm_per_token
@@ -55,9 +55,9 @@ def apply(
     def method_name(self):
         return "w8a8-base"
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    def _create_weight(
+        self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
         raise NotImplementedError("Not implemented")
 
 
@@ -105,26 +105,27 @@ def apply(
     def method_name(self):
         return "vllm-w8a8"
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    def _create_weight(
+        self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
         weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
+        mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
+        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2, weight_scale_split_dim=-1)
+        return mm_param, mm_param_list
 
 
 @QUANTMETHODS.register(["vllm-fp8w8a8", "fp8w8a8"], platform="cuda")
 class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
     def __init__(self):
         super().__init__()
-        self.is_moe = False
         self.has_weight_scale = True
         self.has_weight_zero_point = False
 
     def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
-        if self.is_moe:
-            return self.quantize_moe(weight, output)
+
         qweight, weight_scale = scaled_fp8_quant(
             weight.cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
@@ -132,21 +133,6 @@ def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
         output.weight_scale.copy_(weight_scale.view(-1))
         return
 
-    def quantize_moe(self, weight: torch.Tensor, output: WeightPack) -> WeightPack:
-        num_experts = weight.shape[0]
-        qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
-        weight_scales = []
-        for i in range(num_experts):
-            qweight, weight_scale = scaled_fp8_quant(
-                weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
-            )
-            qweights[i] = qweight
-            weight_scales.append(weight_scale)
-        weight_scale = torch.stack(weight_scales, dim=0).contiguous()
-        output.weight.copy_(qweights)
-        output.weight_scale.copy_(weight_scale.view(-1))
-        return
-
     def apply(
         self,
         input_tensor: torch.Tensor,
@@ -176,13 +162,16 @@ def apply(
     def method_name(self):
         return "vllm-fp8w8a8"
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    def _create_weight(
+        self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
         weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
+        mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
+        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2, weight_scale_split_dim=-1)
+        return mm_param, mm_param_list
 
 
 @QUANTMETHODS.register(["vllm-fp8w8a8-b128", "fp8w8a8-b128"], platform="cuda")
@@ -243,12 +232,15 @@ def apply(
     def method_name(self):
         return "vllm-fp8w8a8-b128"
 
-    def create_weight(
-        self, out_dim: int, in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    def _create_weight(
+        self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
+    ) -> Tuple[WeightPack, List[WeightPack]]:
+        out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
         weight_scale = torch.empty(
             expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
         ).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
+        mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
+        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2, weight_scale_split_dim=-2)
+        return mm_param, mm_param_list

From aec881fdba26fdecea2deed47724355de8803dd7 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 16:35:24 +0000
Subject: [PATCH 40/65] per-channel weight moe

---
 .../basemodel/triton_kernel/fused_moe/grouped_fused_moe.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
index f29d3a2a0..ba9197261 100644
--- a/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
+++ b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
@@ -388,7 +388,7 @@ def grouped_matmul_kernel(
     n,  # int
     topk_num,  # int
     token_scale_ptr,  # [1,] for per tensor quant, or [token_num, hidden_dim // block_size] for per token, group quant
-    weight_scale_ptr,  # [expert_num,] or [export_num, n // block_size_n, k // block_size_k]
+    weight_scale_ptr,  # [expert_num, n] or [export_num, n // block_size_n, k // block_size_k]
     weight_scale_stride0,
     weight_scale_stride1,
     weight_scale_stride2,
@@ -498,7 +498,10 @@ def grouped_matmul_kernel(
             b_scale_ptrs = weight_scale_ptr + expert_id * weight_scale_stride0 + offs_bsn * weight_scale_stride1
         else:
             a_scale = tl.load(token_scale_ptr, eviction_policy="evict_last")
-            b_scale = tl.load(weight_scale_ptr + expert_id, eviction_policy="evict_last")
+            b_scale = tl.load(
+                weight_scale_ptr + expert_id * weight_scale_stride0 + offs_bn[None, :] * weight_scale_stride1,
+                eviction_policy="evict_last",
+            )
             ab_scale = a_scale * b_scale
 
     if NEED_TRANS:

From 9d1073ac39ec5a5e2197ae08d88ec2b8326e857e Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 16:35:46 +0000
Subject: [PATCH 41/65] moe weight buffer remove

---
 .../fused_moe/fused_moe_weight.py             | 32 +++++++--------
 .../meta_weights/mm_weight/mm_weight.py       | 21 ++++------
 lightllm/common/quantization/awq.py           | 29 +++++++++++++-
 lightllm/common/quantization/deepgemm.py      | 15 +++++--
 lightllm/common/quantization/no_quant.py      | 18 ++++-----
 .../common/quantization/quantize_method.py    | 39 +++++++++++--------
 lightllm/common/quantization/w8a8.py          | 26 +++++++++++--
 7 files changed, 115 insertions(+), 65 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index e945be587..926ea30a6 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -266,7 +266,7 @@ def load_hf_weights(self, weights):
             self._load_weight(self.redundancy_expert_idx_to_local_idx, weights)
 
     def verify_load(self):
-        return True
+        return all(all(_weight_pack.load_ok) for _weight_pack in self.w1_list + self.w2_list + self.w3_list)
 
     def _create_weight(self):
         intermediate_size = self.split_inter_size
@@ -333,11 +333,11 @@ def _load_expert(
         row_slice_func = self.row_slicer._slice_weight
         col_slice_func = self.col_slicer._slice_weight
         if w1_weight in weights:
-            self._load_weight_func(row_slice_func(weights[w1_weight]), self.w1_list[local_expert_idx])
+            self.quant_method.load_weight(row_slice_func(weights[w1_weight]), self.w1_list[local_expert_idx])
         if w3_weight in weights:
-            self._load_weight_func(row_slice_func(weights[w3_weight]), self.w3_list[local_expert_idx])
+            self.quant_method.load_weight(row_slice_func(weights[w3_weight]), self.w3_list[local_expert_idx])
         if w2_weight in weights:
-            self._load_weight_func(col_slice_func(weights[w2_weight]), self.w2_list[local_expert_idx])
+            self.quant_method.load_weight(col_slice_func(weights[w2_weight]), self.w2_list[local_expert_idx])
 
     def _load_expert_scale(
         self,
@@ -351,11 +351,11 @@ def _load_expert_scale(
         row_slice_func = self.row_slicer._slice_weight_scale
         col_slice_func = self.col_slicer._slice_weight_scale
         if w1_scale in weights:
-            self._load_weight_scale_func(row_slice_func(weights[w1_scale]), self.w1_list[local_expert_idx])
+            self.quant_method.load_weight_scale(row_slice_func(weights[w1_scale]), self.w1_list[local_expert_idx])
         if w3_scale in weights:
-            self._load_weight_scale_func(row_slice_func(weights[w3_scale]), self.w3_list[local_expert_idx])
+            self.quant_method.load_weight_scale(row_slice_func(weights[w3_scale]), self.w3_list[local_expert_idx])
         if w2_scale in weights:
-            self._load_weight_scale_func(col_slice_func(weights[w2_scale]), self.w2_list[local_expert_idx])
+            self.quant_method.load_weight_scale(col_slice_func(weights[w2_scale]), self.w2_list[local_expert_idx])
 
     def _load_expert_zero_point(
         self,
@@ -375,14 +375,14 @@ def _load_expert_zero_point(
         row_slice_func = self.row_slicer._slice_weight_zero_point
         col_slice_func = self.col_slicer._slice_weight_zero_point
         if w1_zero_point in weights:
-            self._load_weight_zero_point_func(row_slice_func(weights[w1_zero_point]), self.w1_list[local_expert_idx])
+            self.quant_method.load_weight_zero_point(
+                row_slice_func(weights[w1_zero_point]), self.w1_list[local_expert_idx]
+            )
         if w3_zero_point in weights:
-            self._load_weight_zero_point_func(row_slice_func(weights[w3_zero_point]), self.w3_list[local_expert_idx])
+            self.quant_method.load_weight_zero_point(
+                row_slice_func(weights[w3_zero_point]), self.w3_list[local_expert_idx]
+            )
         if w2_zero_point in weights:
-            self._load_weight_zero_point_func(col_slice_func(weights[w2_zero_point]), self.w2_list[local_expert_idx])
-
-    def _load_weight_func(self, weight: torch.Tensor, weight_pack: WeightPack):
-        if self.quant_method.weight_need_quanted(weight):
-            self.quant_method.quantize(weight, weight_pack)
-        else:
-            self.quant_method.load_weight(weight, weight_pack)
+            self.quant_method.load_weight_zero_point(
+                col_slice_func(weights[w2_zero_point]), self.w2_list[local_expert_idx]
+            )
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 912718925..5ca241d2c 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -116,7 +116,7 @@ def _create_weight(self):
             self.bias = torch.empty(sum(self.out_dims), dtype=self.data_type_).cuda(get_current_device_id())
             # bias_list shares storage with bias for each output shard
             self.bias_list = torch.split(self.bias, self.out_dims, dim=0)
-            self.bias._load_ok = [False] * len(self.bias_names)
+            self.bias.load_ok = [False] * len(self.bias_names)
         self.mm_param: WeightPack = None
         self.mm_param_list: List[WeightPack] = None
         self.mm_param, self.mm_param_list = self.quant_method.create_weight(
@@ -130,14 +130,7 @@ def _load_weight(
     ) -> None:
         if param_name in weights:
             weight = self.param_slicer._slice_weight(weights[param_name])
-            if self.quant_method.weight_need_quanted(weight):
-                self.quant_method.quantize(weight, self.mm_param_list[sub_child_index])
-                # online quantization, so we need to set the load_ok for weight_scale and weight_zero_point.
-                self.mm_param_list[sub_child_index].load_ok[1] = True
-                self.mm_param_list[sub_child_index].load_ok[2] = True
-            else:
-                self.quant_method.load_weight(weight, self.mm_param_list[sub_child_index])
-            self.mm_param_list[sub_child_index].load_ok[0] = True
+            self.quant_method.load_weight(weight, self.mm_param_list[sub_child_index])
         return
 
     def _load_bias(
@@ -146,7 +139,7 @@ def _load_bias(
         if param_name in weights:
             bias = self.param_slicer._slice_bias(weights[param_name])
             self.bias_list[sub_child_index].copy_(bias)
-            self.bias_list[sub_child_index]._load_ok = True
+            self.bias_list[sub_child_index].load_ok = True
         return
 
     def _load_weight_scale(
@@ -167,7 +160,7 @@ def _load_weight_zero_point(
 
     def verify_load(self):
         mm_param_load_ok = all(all(_mm_param.load_ok) for _mm_param in self.mm_param_list)
-        bias_load_ok = True if self.bias is None else all(self.bias._load_ok)
+        bias_load_ok = True if self.bias is None else all(self.bias.load_ok)
         if not (mm_param_load_ok and bias_load_ok):
             logger.warning(f"mm_param_load_ok: {self.mm_param_list[0].load_ok}")
         return mm_param_load_ok and bias_load_ok
@@ -210,7 +203,7 @@ def __init__(
 
     def _create_weight(self):
         self.weight = torch.empty(self.dim0, self.dim1, self.dim2, dtype=self.data_type_).cuda(get_current_device_id())
-        self.weight._load_ok = False
+        self.weight.load_ok = False
         return
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
@@ -218,11 +211,11 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
             if weight_name in weights:
                 weight = self.param_slicer._slice_weight(weights[weight_name])
                 self.weight.copy_(weight)
-                self.weight._load_ok = True
+                self.weight.load_ok = True
         return
 
     def verify_load(self):
-        return self.weight._load_ok
+        return self.weight.load_ok
 
     def bmm(
         self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
diff --git a/lightllm/common/quantization/awq.py b/lightllm/common/quantization/awq.py
index e5d0b4519..41a4e7685 100644
--- a/lightllm/common/quantization/awq.py
+++ b/lightllm/common/quantization/awq.py
@@ -119,8 +119,19 @@ def _create_weight(
         weight_zero_point = torch.empty(
             expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
         ).cuda(device_id)
+        weight_out_dims = [_out_dim // self.pack_factor for _out_dim in out_dims]
+        weight_scale_out_dims = out_dims
+        weight_zero_point_out_dims = weight_out_dims
         mm_param = WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
-        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-1, weight_scale_split_dim=-1)
+        mm_param_list = self._split_weight_pack(
+            mm_param,
+            weight_out_dims=weight_out_dims,
+            weight_split_dim=-1,
+            weight_scale_out_dims=weight_scale_out_dims,
+            weight_scale_split_dim=-1,
+            weight_zero_point_out_dims=weight_zero_point_out_dims,
+            weight_zero_point_split_dim=-1,
+        )
         return mm_param, mm_param_list
 
 
@@ -209,8 +220,19 @@ def _create_weight(
         weight_zero_point = torch.empty(
             expert_prefix + (in_dim // group_size, out_dim // self.pack_factor), dtype=torch.int32
         ).cuda(device_id)
+        weight_out_dims = [_out_dim * self.tile_size // self.pack_factor for _out_dim in out_dims]
+        weight_scale_out_dims = out_dims
+        weight_zero_point_out_dims = [_out_dim // self.pack_factor for _out_dim in out_dims]
         mm_param = WeightPack(weight=weight, weight_scale=weight_scale, weight_zero_point=weight_zero_point)
-        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-1, weight_scale_split_dim=-1)
+        mm_param_list = self._split_weight_pack(
+            mm_param,
+            weight_out_dims=weight_out_dims,
+            weight_split_dim=-1,
+            weight_scale_out_dims=weight_scale_out_dims,
+            weight_scale_split_dim=-1,
+            weight_zero_point_out_dims=weight_zero_point_out_dims,
+            weight_zero_point_split_dim=-1,
+        )
         return mm_param, mm_param_list
 
     def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack) -> None:
@@ -225,6 +247,7 @@ def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack) -> None:
             num_bits=self.hf_quantization_config["bits"],
         )
         weight_pack.weight.copy_(repack_weight)
+        weight_pack.load_ok[0] = True
         return
 
     def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack) -> None:
@@ -240,6 +263,7 @@ def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack)
             group_size=self.hf_quantization_config["group_size"],
         )
         weight_pack.weight_scale.copy_(repack_weight_scale)
+        weight_pack.load_ok[1] = True
         return
 
     def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack) -> None:
@@ -253,6 +277,7 @@ def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: W
             num_bits=self.hf_quantization_config["bits"],
         )
         weight_pack.weight_zero_point.copy_(repack_weight_zero_point)
+        weight_pack.load_ok[2] = True
         return
 
 
diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index df6195a21..7193b6b2e 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -1,5 +1,5 @@
 import torch
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Tuple
 
 from lightllm.common.quantization.quantize_method import QuantizationMethod, WeightPack
 from lightllm.common.quantization.registry import QUANTMETHODS
@@ -99,14 +99,23 @@ def apply(
 
     def _create_weight(
         self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
-    ) -> WeightPack:
+    ) -> Tuple[WeightPack, List[WeightPack]]:
         out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
         scale_out_dim = (out_dim + self.block_size - 1) // self.block_size
         scale_in_dim = (in_dim + self.block_size - 1) // self.block_size
         weight_scale = torch.empty(expert_prefix + (scale_out_dim, scale_in_dim), dtype=torch.float32).cuda(device_id)
-        return WeightPack(weight=weight, weight_scale=weight_scale)
+        mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
+        weight_scale_out_dims = [_out_dim // self.block_size for _out_dim in out_dims]
+        mm_param_list = self._split_weight_pack(
+            mm_param,
+            weight_out_dims=out_dims,
+            weight_split_dim=-2,
+            weight_scale_out_dims=weight_scale_out_dims,
+            weight_scale_split_dim=-2,
+        )
+        return mm_param, mm_param_list
 
 
 def _deepgemm_fp8_nt(a_tuple, b_tuple, out):
diff --git a/lightllm/common/quantization/no_quant.py b/lightllm/common/quantization/no_quant.py
index 35ce07c53..3bf023f8a 100644
--- a/lightllm/common/quantization/no_quant.py
+++ b/lightllm/common/quantization/no_quant.py
@@ -28,8 +28,8 @@ def apply(
             device = input_tensor.device
             if use_custom_tensor_mananger:
                 out = g_cache_manager.alloc_tensor(shape, dtype, device=device)
-            else:
-                out = torch.empty(shape, dtype=dtype, device=device)
+        else:
+            out = torch.empty(shape, dtype=dtype, device=device)
         if bias is None:
             return torch.mm(input_tensor, weight, out=out)
         return torch.addmm(bias, input_tensor, weight, out=out)
@@ -42,10 +42,14 @@ def _create_weight(
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=dtype).cuda(device_id)
         mm_param = WeightPack(weight=weight, weight_scale=None, weight_zero_point=None)
         # weight layout is (out_dim, in_dim), so the split dimension is -2.
-        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2)
+        mm_param_list = self._split_weight_pack(
+            mm_param,
+            weight_out_dims=out_dims,
+            weight_split_dim=-2,
+        )
         return mm_param, mm_param_list
 
-    def weight_need_quanted(self, weight: torch.Tensor) -> bool:
+    def _check_weight_need_quanted(self, weight: torch.Tensor) -> bool:
         return False
 
     def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
@@ -54,9 +58,3 @@ def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) ->
     @property
     def method_name(self):
         return "none"
-
-    def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack, start_idx: int = 0) -> None:
-        if weight is None:
-            return
-        weight_pack.weight.copy_(weight)
-        return
diff --git a/lightllm/common/quantization/quantize_method.py b/lightllm/common/quantization/quantize_method.py
index b596780c3..95d8d806f 100644
--- a/lightllm/common/quantization/quantize_method.py
+++ b/lightllm/common/quantization/quantize_method.py
@@ -34,7 +34,6 @@ def __init__(self):
         self.has_weight_zero_point: bool = None
         self.group_size: int = -1  # -1表示不分组即per-channel量化，其他表示分组大小
         self.pack_factor: int = 1
-        self.block_size: int = 1
 
         # 一些量化模式需要用到的额外量化参数，如awq量化
         self.hf_quantization_config = None
@@ -85,30 +84,33 @@ def create_moe_weight(
             num_experts=num_experts,
         )
 
-    def weight_need_quanted(self, weight: torch.Tensor) -> bool:
-        if weight is None:
-            return False
-        # 判断一个 weight 是否需要进行量化操作。
-        return weight.dtype in [torch.bfloat16, torch.float16, torch.float32, torch.float64]
-
     def load_weight(self, weight: torch.Tensor, weight_pack: WeightPack) -> None:
-        if weight is None:
+        if self._check_weight_need_quanted(weight):
+            self.quantize(weight, weight_pack)
+            weight_pack.load_ok = [True, True, True]
             return
-        weight_pack.weight.copy_(weight)
+        weight_pack.weight[:].copy_(weight)
+        weight_pack.load_ok[0] = True
         return
 
     def load_weight_scale(self, weight_scale: torch.Tensor, weight_pack: WeightPack) -> None:
         if weight_scale is None:
             return
         weight_pack.weight_scale.copy_(weight_scale)
+        weight_pack.load_ok[1] = True
         return
 
     def load_weight_zero_point(self, weight_zero_point: torch.Tensor, weight_pack: WeightPack) -> None:
         if weight_zero_point is None:
             return
         weight_pack.weight_zero_point.copy_(weight_zero_point)
+        weight_pack.load_ok[2] = True
         return
 
+    def _check_weight_need_quanted(self, weight: torch.Tensor) -> bool:
+        # 判断一个 weight 是否需要进行量化操作。
+        return weight.dtype in [torch.bfloat16, torch.float16, torch.float32, torch.float64]
+
     def _create_weight(
         self, out_dims: List[int], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
     ) -> Tuple[WeightPack, List[WeightPack]]:
@@ -117,25 +119,28 @@ def _create_weight(
     def _split_weight_pack(
         self,
         weight_pack: WeightPack,
-        out_dims: List[int],
+        weight_out_dims: List[int],
         weight_split_dim: Optional[int],
+        weight_scale_out_dims: List[int] = None,
         weight_scale_split_dim: Optional[int] = None,
+        weight_zero_point_out_dims: List[int] = None,
+        weight_zero_point_split_dim: Optional[int] = None,
     ) -> List[WeightPack]:
         # only support per-channel or block-wise quantization for now.
         mm_param_list: List[WeightPack] = []
-        packed_out_dims = [dim // self.pack_factor for dim in out_dims]
-        scale_out_dims = [dim // self.block_size for dim in out_dims]
-        weight = torch.split(weight_pack.weight, packed_out_dims, dim=weight_split_dim)
+        weight = torch.split(weight_pack.weight, weight_out_dims, dim=weight_split_dim)
         weight_scale = (
-            [None] * len(out_dims)
+            [None] * len(weight_out_dims)
             if weight_pack.weight_scale is None
-            else (torch.split(weight_pack.weight_scale, scale_out_dims, dim=weight_scale_split_dim))
+            else (torch.split(weight_pack.weight_scale, weight_scale_out_dims, dim=weight_scale_split_dim))
         )
         # the ndim of weight_zero_point is the same as weight_scale.
         weight_zero_point = (
-            [None] * len(out_dims)
+            [None] * len(weight_out_dims)
             if weight_pack.weight_zero_point is None
-            else (torch.split(weight_pack.weight_zero_point, packed_out_dims, dim=weight_scale_split_dim))
+            else (
+                torch.split(weight_pack.weight_zero_point, weight_zero_point_out_dims, dim=weight_zero_point_split_dim)
+            )
         )
         for weight, weight_scale, weight_zero_point in zip(weight, weight_scale, weight_zero_point):
             mm_param_list.append(
diff --git a/lightllm/common/quantization/w8a8.py b/lightllm/common/quantization/w8a8.py
index 9fbd9b570..db79b4e31 100644
--- a/lightllm/common/quantization/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -113,7 +113,13 @@ def _create_weight(
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.int8).cuda(device_id)
         weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
         mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
-        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2, weight_scale_split_dim=-1)
+        mm_param_list = self._split_weight_pack(
+            mm_param,
+            weight_out_dims=out_dims,
+            weight_split_dim=-2,
+            weight_scale_out_dims=out_dims,
+            weight_scale_split_dim=-1,
+        )
         return mm_param, mm_param_list
 
 
@@ -170,7 +176,14 @@ def _create_weight(
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
         weight_scale = torch.empty(expert_prefix + (out_dim,), dtype=torch.float32).cuda(device_id)
         mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
-        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2, weight_scale_split_dim=-1)
+
+        mm_param_list = self._split_weight_pack(
+            mm_param,
+            weight_out_dims=out_dims,
+            weight_split_dim=-2,
+            weight_scale_out_dims=out_dims,
+            weight_scale_split_dim=-1,
+        )
         return mm_param, mm_param_list
 
 
@@ -242,5 +255,12 @@ def _create_weight(
             expert_prefix + (out_dim // self.block_size, in_dim // self.block_size), dtype=torch.float32
         ).cuda(device_id)
         mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
-        mm_param_list = self._split_weight_pack(mm_param, out_dims, weight_split_dim=-2, weight_scale_split_dim=-2)
+        weight_scale_out_dims = [_out_dim // self.block_size for _out_dim in out_dims]
+        mm_param_list = self._split_weight_pack(
+            mm_param,
+            weight_out_dims=out_dims,
+            weight_split_dim=-2,
+            weight_scale_out_dims=weight_scale_out_dims,
+            weight_scale_split_dim=-2,
+        )
         return mm_param, mm_param_list

From ba98c62503f802bcead1acb06db7fced9d0b6bc0 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 16:41:27 +0000
Subject: [PATCH 42/65] fix deepgemm: use ceil division for weight_scale_out_dims

---
 lightllm/common/quantization/deepgemm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index 7193b6b2e..88877279c 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -107,7 +107,7 @@ def _create_weight(
         scale_in_dim = (in_dim + self.block_size - 1) // self.block_size
         weight_scale = torch.empty(expert_prefix + (scale_out_dim, scale_in_dim), dtype=torch.float32).cuda(device_id)
         mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
-        weight_scale_out_dims = [_out_dim // self.block_size for _out_dim in out_dims]
+        weight_scale_out_dims = [(_out_dim + self.block_size - 1) // self.block_size for _out_dim in out_dims]
         mm_param_list = self._split_weight_pack(
             mm_param,
             weight_out_dims=out_dims,

From 2f9fa56aac548f47571d1e5ad2cadd8a947e74c7 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Mon, 26 Jan 2026 17:29:53 +0000
Subject: [PATCH 43/65] fix internvl 26b

---
 .../meta_weights/mm_weight/mm_weight.py       |  5 +-
 .../layer_weights/meta_weights/norm_weight.py | 48 ++++++++++---------
 .../vit/layer_infer/post_layer_infer.py       | 20 ++------
 .../pre_and_post_layer_weight.py              | 44 +++++++----------
 .../layer_weights/transformer_layer_weight.py |  9 +++-
 5 files changed, 57 insertions(+), 69 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 5ca241d2c..9f34d5dfa 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -116,7 +116,8 @@ def _create_weight(self):
             self.bias = torch.empty(sum(self.out_dims), dtype=self.data_type_).cuda(get_current_device_id())
             # bias_list shares storage with bias for each output shard
             self.bias_list = torch.split(self.bias, self.out_dims, dim=0)
-            self.bias.load_ok = [False] * len(self.bias_names)
+            for sub_bias in self.bias_list:
+                sub_bias.load_ok = False
         self.mm_param: WeightPack = None
         self.mm_param_list: List[WeightPack] = None
         self.mm_param, self.mm_param_list = self.quant_method.create_weight(
@@ -160,7 +161,7 @@ def _load_weight_zero_point(
 
     def verify_load(self):
         mm_param_load_ok = all(all(_mm_param.load_ok) for _mm_param in self.mm_param_list)
-        bias_load_ok = True if self.bias is None else all(self.bias.load_ok)
+        bias_load_ok = True if self.bias is None else all(sub_bias.load_ok for sub_bias in self.bias_list)
         if not (mm_param_load_ok and bias_load_ok):
             logger.warning(f"mm_param_load_ok: {self.mm_param_list[0].load_ok}")
         return mm_param_load_ok and bias_load_ok
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 0ce6ba2f1..931ee92a7 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -9,7 +9,7 @@
 
 
 class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
-    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, **kwargs):
         super().__init__()
         self.dim = dim
         self.weight_name = weight_name
@@ -113,10 +113,12 @@ def _native_forward(
     def _triton_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
-        assert input.ndim == 2 and self.weight.ndim == 1
+        # assert input.ndim == 2 and self.weight.ndim == 1
+        print(input.shape)
         if out is None:
             out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
-        return layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps, out=out)
+        out[:] = layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps)
+        return out
 
     def _cuda_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
@@ -137,34 +139,36 @@ def __call__(
 
 
 class TpRMSNormWeight(RMSNormWeight):
-    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
+    def __init__(self, head_num, head_dim, weight_name: str, data_type: torch.dtype):
+        padded_head_num = self._get_tp_padded_head_num(head_num)
+        dim = padded_head_num * head_dim
         super().__init__(dim=dim, weight_name=weight_name, data_type=data_type)
-        self.tp_world_size_ = get_dp_world_size()
-        self.tp_rank_ = get_current_rank_in_dp()
-        self.dim = self._get_tp_padded_dim(dim=dim)
         self.repeat_times_ = 1
 
-    def _get_tp_padded_dim(self, dim: int):
+    def _get_tp_padded_head_num(self, head_num: int):
         """
         Get the padded dimension for the weight.
-        1. if dim is divisible by tp_world_size_, return dim
-        2. if dim is greater than tp_world_size_, return (dim + tp_world_size_ - 1) // tp_world_size_ * tp_world_size_
-        3. if dim is less than tp_world_size_, assert tp_world_size_ is divisible by dim, and return dim
+        1. If head_num is divisible by tp_world_size_, return head_num.
+        2. If head_num is greater than tp_world_size_, return:
+           (head_num + tp_world_size_ - 1) // tp_world_size_ * tp_world_size_
+        3. If head_num is less than tp_world_size_, assert tp_world_size_ is
+           divisible by head_num, and return head_num.
         """
-        if dim % self.tp_world_size_ == 0:
-            return dim // self.tp_world_size_
+        self.tp_world_size_ = get_dp_world_size()
+        if head_num % self.tp_world_size_ == 0:
+            return head_num // self.tp_world_size_
 
-        if dim > self.tp_world_size_:
-            return (dim + self.tp_world_size_ - 1) // self.tp_world_size_ * self.tp_world_size_
+        if head_num > self.tp_world_size_:
+            return (head_num + self.tp_world_size_ - 1) // self.tp_world_size_ * self.tp_world_size_
         else:
             assert (
-                self.tp_world_size_ % dim == 0
-            ), f"tp_world_size_ must be divisible by dim, but found: {self.tp_world_size_} % {dim}"
-            self.repeat_times_ = self.tp_world_size_ // dim
-            return dim * self.repeat_times_ // self.tp_world_size_
+                self.tp_world_size_ % head_num == 0
+            ), f"tp_world_size_ must be divisible by head_num, but found: {self.tp_world_size_} % {head_num}"
+            self.repeat_times_ = self.tp_world_size_ // head_num
+            return head_num * self.repeat_times_ // self.tp_world_size_
 
     def load_hf_weights(self, weights):
-        if self.weight_name in weights and self.weight is None:
+        if self.weight_name in weights:
             t_weight = weights[self.weight_name]
             hidden_size = t_weight.shape[0]
             split_hidden_size = hidden_size // self.tp_world_size_
@@ -172,9 +176,9 @@ def load_hf_weights(self, weights):
             start = split_hidden_size * self.tp_rank_ // self.repeat_times_
             end = min(split_hidden_size * (self.tp_rank_ + 1) // self.repeat_times_, hidden_size)
 
-            self.weight[:, end - start].copy_(t_weight[start:end].to(self.data_type_))
+            self.weight[: end - start].copy_(t_weight[start:end].to(self.data_type_))
             # the padding part is zero
-            self.weight[:, end:].zero_()
+            self.weight[end - start :].zero_()
             self.weight.load_ok = True
 
 
diff --git a/lightllm/models/vit/layer_infer/post_layer_infer.py b/lightllm/models/vit/layer_infer/post_layer_infer.py
index fa4a87f15..0eb0c4604 100644
--- a/lightllm/models/vit/layer_infer/post_layer_infer.py
+++ b/lightllm/models/vit/layer_infer/post_layer_infer.py
@@ -15,6 +15,7 @@ def __init__(self, network_config):
         self.network_config_ = network_config
         self.llm_hidden_size = network_config["llm_hidden_size"]
         self.downsample_ratio = network_config["downsample_ratio"]
+        self.eps_ = network_config["layer_norm_eps"]
         return
 
     def pixel_shuffle(self, x, scale_factor=0.5):
@@ -33,25 +34,12 @@ def forward(self, vit_embeds, layer_weight: ViTPreAndPostLayerWeight):
         h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
         vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
-        vit_embeds_norm = torch.nn.functional.layer_norm(
-            vit_embeds,
-            (vit_embeds.shape[-1],),
-            weight=layer_weight.layernorm_weight_,
-            bias=layer_weight.layernorm_bias_,
-        )
-
-        vit_embeds_1 = torch.addmm(
-            layer_weight.mlp1_1_bias_, vit_embeds_norm.view(-1, vit_embeds_norm.shape[-1]), layer_weight.mlp1_1_weight_
-        )
+        vit_embeds_norm = layer_weight.layernorm_weight_(input=vit_embeds, eps=self.eps_)
+        vit_embeds_1 = layer_weight.mlp1_1_.mm(vit_embeds_norm.view(-1, vit_embeds_norm.shape[-1]))
 
         vit_embeds_gelu = gelu_fwd(vit_embeds_1, use_custom_tensor_mananger=True)
 
-        vit_embeds_out = torch.addmm(
-            layer_weight.mlp1_3_bias_,
-            vit_embeds_gelu.view(-1, self.llm_hidden_size // self.tp_world_size_),
-            layer_weight.mlp1_3_weight_,
-            beta=1.0 / self.tp_world_size_,
-        )
+        vit_embeds_out = layer_weight.mlp1_3_.mm(vit_embeds_gelu.view(-1, self.llm_hidden_size // self.tp_world_size_))
 
         if self.tp_world_size_ == 1:
             return vit_embeds_out.view(batch_size, -1, self.llm_hidden_size)
diff --git a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
index 0d753aef8..a8d6fbbb4 100644
--- a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
@@ -4,7 +4,7 @@
 import torch.nn.functional as F
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.utils.dist_utils import get_current_device_id
-from lightllm.common.basemodel.layer_weights.meta_weights import LayerNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import LayerNormWeight, COLMMWeight, ROWMMWeight
 
 
 class ViTPreAndPostLayerWeight(PreAndPostLayerWeight):
@@ -14,6 +14,7 @@ def __init__(self, data_type, network_config):
         self.image_size = self.network_config_["image_size"]
         self.patch_size = self.network_config_["patch_size"]
         self.llm_hidden_size = self.network_config_["llm_hidden_size"]
+        self.downsample_ratio = self.network_config_["downsample_ratio"]
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self._create_weight()
@@ -33,22 +34,26 @@ def _create_weight(self):
         ).cuda()
         self.patch_embedding_bias_ = torch.empty(split_embed_dim, dtype=self.data_type_).cuda()
 
-        split_indexes_llm = np.linspace(0, self.llm_hidden_size, self.tp_world_size_ + 1, dtype=np.int64)
-        split_start_llm = split_indexes_llm[self.tp_rank_]
-        split_end_llm = split_indexes_llm[self.tp_rank_ + 1]
-        split_llm_hidden_size = split_end_llm - split_start_llm
-
-        self.mlp1_1_weight_ = torch.empty((self.llm_hidden_size, split_llm_hidden_size), dtype=self.data_type_).cuda()
-        self.mlp1_1_bias_ = torch.empty(split_llm_hidden_size, dtype=self.data_type_).cuda()
-        self.mlp1_3_weight_ = torch.empty((split_llm_hidden_size, self.llm_hidden_size), dtype=self.data_type_).cuda()
-        self.mlp1_3_bias_ = torch.empty(self.llm_hidden_size, dtype=self.data_type_).cuda()
-
         self.layernorm_weight_ = LayerNormWeight(
-            dim=self.embed_dim,
+            dim=self.embed_dim * int(1 / self.downsample_ratio) ** 2,
             weight_name="mlp1.0.weight",
             data_type=self.data_type_,
             bias_name="mlp1.0.bias",
         )
+        self.mlp1_1_ = ROWMMWeight(
+            in_dim=self.embed_dim * int(1 / self.downsample_ratio) ** 2,
+            out_dims=[self.llm_hidden_size],
+            weight_names=["mlp1.1.weight"],
+            data_type=self.data_type_,
+            bias_names=["mlp1.1.bias"],
+        )
+        self.mlp1_3_ = COLMMWeight(
+            in_dim=self.llm_hidden_size,
+            out_dims=[self.llm_hidden_size],
+            weight_names=["mlp1.3.weight"],
+            data_type=self.data_type_,
+            bias_names=["mlp1.3.bias"],
+        )
         return
 
     def _cuda(self, cpu_tensor):
@@ -90,21 +95,6 @@ def load_hf_weights(self, weights):
             self.patch_embedding_bias_.copy_(
                 weights["vision_model.embeddings.patch_embedding.bias"][split_start:split_end]
             )
-
-        split_indexes = np.linspace(0, self.llm_hidden_size, self.tp_world_size_ + 1, dtype=np.int64)
-        split_start = split_indexes[self.tp_rank_]
-        split_end = split_indexes[self.tp_rank_ + 1]
-
-        if "mlp1.1.weight" in weights:
-            self.mlp1_1_weight_.copy_(weights["mlp1.1.weight"][split_start:split_end, :].t())
-        if "mlp1.1.bias" in weights:
-            self.mlp1_1_bias_.copy_(weights["mlp1.1.bias"][split_start:split_end])
-
-        if "mlp1.3.weight" in weights:
-            self.mlp1_3_weight_.copy_(weights["mlp1.3.weight"][:, split_start:split_end].t())
-        if "mlp1.3.bias" in weights:
-            self.mlp1_3_bias_.copy_(weights["mlp1.3.bias"])
-
         return
 
     def verify_load(self):
diff --git a/lightllm/models/vit/layer_weights/transformer_layer_weight.py b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
index 03ce2a7a3..786c0d2ff 100644
--- a/lightllm/models/vit/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
@@ -138,13 +138,18 @@ def _init_norm(self):
             bias_name=self._ffn_norm_bias_name,
         )
         if self.qk_norm:
+            head_num = self.network_config_["num_attention_heads"]
+            head_dim = self.network_config_["hidden_size"] // head_num
+            head_dim = self.network_config_.get("head_dim", head_dim)
             self.q_norm_weight_ = TpRMSNormWeight(
-                dim=hidden_size,
+                head_num=head_num,
+                head_dim=head_dim,
                 weight_name=self._q_norm_weight_name,
                 data_type=self.data_type_,
             )
             self.k_norm_weight_ = TpRMSNormWeight(
-                dim=hidden_size,
+                head_num=head_num,
+                head_dim=head_dim,
                 weight_name=self._k_norm_weight_name,
                 data_type=self.data_type_,
             )

From 3aaf235dfe0e512c689740115da69c0c9431aae3 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 03:15:44 +0000
Subject: [PATCH 44/65] minor typing fix: move bool return annotation to verify_load

---
 .../basemodel/layer_weights/meta_weights/base_weight.py     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
index da0388786..714e7acf4 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
@@ -18,7 +18,7 @@ def _create_weight(self):
         pass
 
     @abstractmethod
-    def verify_load(self):
+    def verify_load(self) -> bool:
         pass
 
 
@@ -33,8 +33,8 @@ def __init__(self, tp_rank: int = None, tp_world_size: int = None, data_type: to
     def load_hf_weights(self, weights):
         raise NotImplementedError("load_hf_weights must implement this method")
 
-    def verify_load(self):
+    def verify_load(self) -> bool:
         raise NotImplementedError("verify_load must implement this method")
 
-    def _create_weight(self) -> bool:
+    def _create_weight(self):
         raise NotImplementedError("create_weight must implement this method")

From 74bb0abac0e44c34e6068f6afef1b81801506f73 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 27 Jan 2026 06:36:37 +0000
Subject: [PATCH 45/65] internvl fix

---
 .../meta_weights/mm_weight/mm_weight.py       |  4 +-
 .../layer_weights/meta_weights/norm_weight.py |  2 +-
 .../layer_weights/transformer_layer_weight.py |  2 +-
 .../layer_weights/transformer_layer_weight.py |  4 ++
 .../pre_and_post_layer_weight.py              |  6 ++-
 .../layer_weights/transformer_layer_weight.py | 37 ++++++++++++-------
 lightllm/models/vit/model.py                  |  4 +-
 7 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 9f34d5dfa..f9745232d 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -61,7 +61,7 @@ def mm(
         )
 
     def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod]):
-        if quant_method is None:
+        if quant_method.method_name == "none":
             self.weight_zero_point_names = None
             self.weight_scale_names = None
             return
@@ -82,7 +82,7 @@ def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod
                 quanted_weight_names.append(weight_name)
 
         if len(quanted_weight_names) != 0:
-            self.weight_names = quanted_weight_names
+            self.quanted_weight_names = quanted_weight_names
 
         if len(weight_scale_names) != 0:
             self.weight_scale_names = weight_scale_names
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 931ee92a7..4b02d3c30 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -9,7 +9,7 @@
 
 
 class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
-    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, **kwargs):
+    def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
         super().__init__()
         self.dim = dim
         self.weight_name = weight_name
diff --git a/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py b/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py
index c6ce1049f..86a887a25 100644
--- a/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py
+++ b/lightllm/common/basemodel/layer_weights/transformer_layer_weight.py
@@ -15,7 +15,7 @@ def __init__(self, layer_num, data_type, network_config, quant_cfg):
         self.layer_num_ = layer_num
         self.data_type_ = data_type
         self.network_config_ = network_config
-        self.quant_cfg = quant_cfg
+        self.quant_cfg: Quantcfg = quant_cfg
         self._parse_config()
         self._init_weight_names()
         self._init_weight()
diff --git a/lightllm/models/internlm2/layer_weights/transformer_layer_weight.py b/lightllm/models/internlm2/layer_weights/transformer_layer_weight.py
index a05e977f1..e528ee9b5 100755
--- a/lightllm/models/internlm2/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/internlm2/layer_weights/transformer_layer_weight.py
@@ -21,6 +21,10 @@ def load_hf_weights(self, weights):
             del weights[qkv_weight_name]
         super().load_hf_weights(weights)
 
+    def _parse_config(self):
+        super()._parse_config()
+        self.n_kv_head = self.network_config_["num_key_value_heads"]
+
     def _init_weight_names(self):
         super()._init_weight_names()
         self._o_weight_name = f"model.layers.{self.layer_num_}.attention.wo.weight"
diff --git a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
index a8d6fbbb4..73eb0b46a 100644
--- a/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/pre_and_post_layer_weight.py
@@ -5,10 +5,11 @@
 from lightllm.common.basemodel import PreAndPostLayerWeight
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.common.basemodel.layer_weights.meta_weights import LayerNormWeight, COLMMWeight, ROWMMWeight
+from lightllm.common.quantization import Quantcfg
 
 
 class ViTPreAndPostLayerWeight(PreAndPostLayerWeight):
-    def __init__(self, data_type, network_config):
+    def __init__(self, data_type, network_config, quant_cfg):
         super().__init__(data_type, network_config)
         self.embed_dim = self.network_config_["hidden_size"]
         self.image_size = self.network_config_["image_size"]
@@ -17,6 +18,7 @@ def __init__(self, data_type, network_config):
         self.downsample_ratio = self.network_config_["downsample_ratio"]
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
+        self.quant_cfg: Quantcfg = quant_cfg
         self._create_weight()
         return
 
@@ -46,6 +48,7 @@ def _create_weight(self):
             weight_names=["mlp1.1.weight"],
             data_type=self.data_type_,
             bias_names=["mlp1.1.bias"],
+            quant_method=self.quant_cfg.get_quant_method(-1, "mlp1_1"),
         )
         self.mlp1_3_ = COLMMWeight(
             in_dim=self.llm_hidden_size,
@@ -53,6 +56,7 @@ def _create_weight(self):
             weight_names=["mlp1.3.weight"],
             data_type=self.data_type_,
             bias_names=["mlp1.3.bias"],
+            quant_method=self.quant_cfg.get_quant_method(-1, "mlp1_3"),
         )
         return
 
diff --git a/lightllm/models/vit/layer_weights/transformer_layer_weight.py b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
index 786c0d2ff..198b3022b 100644
--- a/lightllm/models/vit/layer_weights/transformer_layer_weight.py
+++ b/lightllm/models/vit/layer_weights/transformer_layer_weight.py
@@ -123,20 +123,31 @@ def _init_ffn(self):
         )
 
     def _init_norm(self):
-        norm_weight_cls = RMSNormWeight if self.norm_type == "rms_norm" else LayerNormWeight
         hidden_size = self.network_config_["hidden_size"]
-        self.att_norm_weight_ = norm_weight_cls(
-            dim=hidden_size,
-            weight_name=self._att_norm_weight_name,
-            data_type=self.data_type_,
-            bias_name=self._att_norm_bias_name,
-        )
-        self.ffn_norm_weight_ = norm_weight_cls(
-            dim=hidden_size,
-            weight_name=self._ffn_norm_weight_name,
-            data_type=self.data_type_,
-            bias_name=self._ffn_norm_bias_name,
-        )
+        if self.norm_type == "rms_norm":
+            self.att_norm_weight_ = RMSNormWeight(
+                dim=hidden_size,
+                weight_name=self._att_norm_weight_name,
+                data_type=self.data_type_,
+            )
+            self.ffn_norm_weight_ = RMSNormWeight(
+                dim=hidden_size,
+                weight_name=self._ffn_norm_weight_name,
+                data_type=self.data_type_,
+            )
+        else:
+            self.att_norm_weight_ = LayerNormWeight(
+                dim=hidden_size,
+                weight_name=self._att_norm_weight_name,
+                data_type=self.data_type_,
+                bias_name=self._att_norm_bias_name,
+            )
+            self.ffn_norm_weight_ = LayerNormWeight(
+                dim=hidden_size,
+                weight_name=self._ffn_norm_weight_name,
+                data_type=self.data_type_,
+                bias_name=self._ffn_norm_bias_name,
+            )
         if self.qk_norm:
             head_num = self.network_config_["num_attention_heads"]
             head_dim = self.network_config_["hidden_size"] // head_num
diff --git a/lightllm/models/vit/model.py b/lightllm/models/vit/model.py
index 9c2bc4242..13f8e2827 100644
--- a/lightllm/models/vit/model.py
+++ b/lightllm/models/vit/model.py
@@ -111,7 +111,9 @@ def _padding_hidden_size(self):
         return
 
     def _init_weights(self):
-        self.pre_post_weight = self.pre_and_post_weight_class(self.data_type, network_config=self.config)
+        self.pre_post_weight = self.pre_and_post_weight_class(
+            self.data_type, network_config=self.config, quant_cfg=self.quant_cfg
+        )
         self.trans_layers_weight = [
             self.transformer_weight_class(
                 i,

From 34dd483bd34e912a65b77f3c10247f58a59b86b6 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 06:38:41 +0000
Subject: [PATCH 46/65] fix norm weight tp rank init and triton layernorm out handling

---
 .../layer_weights/meta_weights/norm_weight.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
index 4b02d3c30..c922bffc4 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -10,7 +10,7 @@
 
 class RMSNormWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
-        super().__init__()
+        super().__init__(tp_rank=0, tp_world_size=1)
         self.dim = dim
         self.weight_name = weight_name
         self.data_type_ = data_type
@@ -73,7 +73,7 @@ def __call__(
 
 class LayerNormWeight(BaseWeightTpl, PlatformAwareOp):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
-        super().__init__()
+        super().__init__(tp_rank=0, tp_world_size=1)
         self.dim = dim
         self.weight_name = weight_name
         self.bias_name = bias_name
@@ -114,11 +114,11 @@ def _triton_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
     ) -> torch.Tensor:
         # assert input.ndim == 2 and self.weight.ndim == 1
-        print(input.shape)
         if out is None:
-            out = alloc_func(input.shape, dtype=input.dtype, device=input.device)
-        out[:] = layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps)
-        return out
+            return layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps)
+        else:
+            out.copy_(layernorm_forward(x=input, weight=self.weight, bias=self.bias, eps=eps))
+            return out
 
     def _cuda_forward(
         self, input: torch.Tensor, eps: float, out: Optional[torch.Tensor] = None, alloc_func=torch.empty
@@ -143,6 +143,9 @@ def __init__(self, head_num, head_dim, weight_name: str, data_type: torch.dtype)
         padded_head_num = self._get_tp_padded_head_num(head_num)
         dim = padded_head_num * head_dim
         super().__init__(dim=dim, weight_name=weight_name, data_type=data_type)
+        # 重新初始化 tp rank 的信息， load hf weights 的时候会用到
+        self.tp_rank_ = get_current_rank_in_dp()
+        self.tp_world_size_ = get_dp_world_size()
         self.repeat_times_ = 1
 
     def _get_tp_padded_head_num(self, head_num: int):
@@ -185,8 +188,6 @@ def load_hf_weights(self, weights):
 class NoTpGEMMANormWeight(RMSNormWeight):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
         super().__init__(dim=dim, weight_name=weight_name, data_type=data_type)
-        self.tp_world_size_ = 1
-        self.tp_rank_ = 0
 
     def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
         if self.weight_name in weights:
@@ -197,8 +198,6 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
 class QKRMSNORMWeight(RMSNormWeight):
     def __init__(self, dim: int, weight_name: str, data_type: torch.dtype):
         super().__init__(dim=dim, weight_name=weight_name, data_type=data_type)
-        self.tp_world_size_ = 1
-        self.tp_rank_ = 0
 
     def _native_forward(
         self,

From 991aa56fd0e0c675b9797645b9432ba507fa97f5 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 06:42:21 +0000
Subject: [PATCH 47/65] start_args_type.py add enable_ep_moe

---
 lightllm/server/core/objs/start_args_type.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
index 239cebfdd..059cd739f 100644
--- a/lightllm/server/core/objs/start_args_type.py
+++ b/lightllm/server/core/objs/start_args_type.py
@@ -127,6 +127,7 @@ class StartArgs:
     penalty_counter_mode: str = field(
         default="gpu_counter", metadata={"choices": ["cpu_counter", "pin_mem_counter", "gpu_counter"]}
     )
+    enable_ep_moe: bool = field(default=False)
     ep_redundancy_expert_config_path: Optional[str] = field(default=None)
     auto_update_redundancy_expert: bool = field(default=False)
     mtp_mode: Optional[str] = field(

From 4d4d14a78e29d67e59ddfc54497cc1a382c74c3e Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 06:44:50 +0000
Subject: [PATCH 48/65] add docstring explaining why gpt_oss fused moe _create_weight is a no-op

---
 .../meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py      | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index 666419f9c..b835cffbd 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -81,6 +81,9 @@ def __init__(
         return
 
     def _create_weight(self):
+        """
+        The loading procedure is special, so the weight is not created here.
+        """
         pass
 
     def _fuse_weight_scale(self):

From be2264de7d22daf8f5de0ef9457001a0abd3ca97 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 27 Jan 2026 06:51:30 +0000
Subject: [PATCH 49/65] fix mm weight: use self.quant_method for quant param names and prefer quantized weight names when loading

---
 .../meta_weights/mm_weight/mm_weight.py       | 38 ++++++++++---------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index f9745232d..60e5d16b4 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -51,7 +51,7 @@ def __init__(
         self.quant_method: QuantizationMethod = NoQuantization() if quant_method is None else quant_method
         self.param_slicer: SliceMixinTpl = None
         self._create_weight()
-        self.gen_weight_quant_param_names(quant_method=quant_method)
+        self.gen_weight_quant_param_names()
 
     def mm(
         self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
@@ -60,10 +60,11 @@ def mm(
             input_tensor, self.mm_param, out, use_custom_tensor_mananger=use_custom_tensor_mananger, bias=self.bias
         )
 
-    def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod]):
-        if quant_method.method_name == "none":
-            self.weight_zero_point_names = None
-            self.weight_scale_names = None
+    def gen_weight_quant_param_names(self):
+        if self.quant_method.method_name == "none":
+            self.quanted_weight_names = [None] * len(self.weight_names)
+            self.weight_zero_point_names = [None] * len(self.weight_names)
+            self.weight_scale_names = [None] * len(self.weight_names)
             return
 
         quanted_weight_names = []
@@ -71,14 +72,14 @@ def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod
         weight_zero_point_names = []
 
         for weight_name in self.weight_names:
-            if quant_method.weight_scale_suffix is not None:
-                weight_scale_name = weight_name.replace("weight", quant_method.weight_scale_suffix)
+            if self.quant_method.weight_scale_suffix is not None:
+                weight_scale_name = weight_name.replace("weight", self.quant_method.weight_scale_suffix)
                 weight_scale_names.append(weight_scale_name)
-            if quant_method.weight_zero_point_suffix is not None:
-                weight_zero_point_name = weight_name.replace("weight", quant_method.weight_zero_point_suffix)
+            if self.quant_method.weight_zero_point_suffix is not None:
+                weight_zero_point_name = weight_name.replace("weight", self.quant_method.weight_zero_point_suffix)
                 weight_zero_point_names.append(weight_zero_point_name)
-            if quant_method.weight_suffix is not None:
-                weight_name = weight_name.replace("weight", quant_method.weight_suffix)
+            if self.quant_method.weight_suffix is not None:
+                weight_name = weight_name.replace("weight", self.quant_method.weight_suffix)
                 quanted_weight_names.append(weight_name)
 
         if len(quanted_weight_names) != 0:
@@ -99,16 +100,13 @@ def load_hf_weights(self, weights):
 
         for sub_child_index, param_name in enumerate(self.weight_names):
             self._load_weight(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
-
+        for sub_child_index, param_name in enumerate(self.weight_scale_names):
+            self._load_weight_scale(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
+        for sub_child_index, param_name in enumerate(self.weight_zero_point_names):
+            self._load_weight_zero_point(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
         if self.bias_names is not None:
             for sub_child_index, param_name in enumerate(self.bias_names):
                 self._load_bias(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
-        if self.weight_scale_names is not None:
-            for sub_child_index, param_name in enumerate(self.weight_scale_names):
-                self._load_weight_scale(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
-        if self.weight_zero_point_names is not None:
-            for sub_child_index, param_name in enumerate(self.weight_zero_point_names):
-                self._load_weight_zero_point(param_name=param_name, weights=weights, sub_child_index=sub_child_index)
 
     def _create_weight(self):
         self.bias = None
@@ -129,6 +127,10 @@ def _create_weight(self):
     def _load_weight(
         self, param_name: Union[str, List[str]], weights: Dict[str, torch.Tensor], sub_child_index: int
     ) -> None:
+        quanted_param_name = self.quanted_weight_names[sub_child_index]
+        # If the checkpoint stores this weight under its quantized name, load it from there.
+        if quanted_param_name in weights:
+            param_name = quanted_param_name
         if param_name in weights:
             weight = self.param_slicer._slice_weight(weights[param_name])
             self.quant_method.load_weight(weight, self.mm_param_list[sub_child_index])

From a6ecf9679cdf9f58196ba3667f1f133aedf284ed Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 07:16:12 +0000
Subject: [PATCH 50/65] moe matmul use per token quant for all.

---
 .../fused_moe/grouped_fused_moe.py              | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
index ba9197261..97075e912 100644
--- a/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
+++ b/lightllm/common/basemodel/triton_kernel/fused_moe/grouped_fused_moe.py
@@ -387,7 +387,7 @@ def grouped_matmul_kernel(
     k,  # int
     n,  # int
     topk_num,  # int
-    token_scale_ptr,  # [1,] for per tensor quant, or [token_num, hidden_dim // block_size] for per token, group quant
+    token_scale_ptr,  # [token_num,] for pertoken quant, or [token_num,hidden_dim//block_size] for per group quant
     weight_scale_ptr,  # [expert_num, n] or [export_num, n // block_size_n, k // block_size_k]
     weight_scale_stride0,
     weight_scale_stride1,
@@ -497,7 +497,14 @@ def grouped_matmul_kernel(
 
             b_scale_ptrs = weight_scale_ptr + expert_id * weight_scale_stride0 + offs_bsn * weight_scale_stride1
         else:
-            a_scale = tl.load(token_scale_ptr, eviction_policy="evict_last")
+            # per token scale quant
+            if TOKEN_INPUT_USE_TMA:
+                assert MUL_ROUTED_WEIGHT is True
+                a_scale_ptrs = token_scale_ptr + (token_start_index + tl.arange(0, BLOCK_SIZE_M))[:, None]
+            else:
+                a_scale_ptrs = token_scale_ptr + (a_m_index // topk_num)[:, None]
+
+            a_scale = tl.load(a_scale_ptrs, eviction_policy="evict_last")
             b_scale = tl.load(
                 weight_scale_ptr + expert_id * weight_scale_stride0 + offs_bn[None, :] * weight_scale_stride1,
                 eviction_policy="evict_last",
@@ -748,8 +755,12 @@ def grouped_matmul(
     if use_fp8_w8a8:
         # 当权重使用 block wise 量化时，激活也使用 per token， group size 量化
         if block_size_k == 0:
-            token_inputs, token_input_scale = vllm_ops.scaled_fp8_quant(token_inputs, token_input_scale)
+            # the input uses per-token quantization
+            token_inputs, token_input_scale = vllm_ops.scaled_fp8_quant(
+                token_inputs, token_input_scale, use_per_token_if_dynamic=True
+            )
         else:
+            # the input uses per-group quantization
             _m, _k = token_inputs.shape
             assert _k % block_size_k == 0
             token_inputs, token_input_scale = per_token_group_quant_fp8(

From 64bf50156114bfe729926faff62c1cf989ee34ee Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 27 Jan 2026 08:29:31 +0000
Subject: [PATCH 51/65] fix mm weight: keep quant param name lists index-aligned with weight_names

---
 .../meta_weights/mm_weight/mm_weight.py       | 33 ++++---------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 60e5d16b4..3630bc2c0 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -61,39 +61,20 @@ def mm(
         )
 
     def gen_weight_quant_param_names(self):
-        if self.quant_method.method_name == "none":
-            self.quanted_weight_names = [None] * len(self.weight_names)
-            self.weight_zero_point_names = [None] * len(self.weight_names)
-            self.weight_scale_names = [None] * len(self.weight_names)
-            return
+        self.quanted_weight_names = [None] * len(self.weight_names)
+        self.weight_zero_point_names = [None] * len(self.weight_names)
+        self.weight_scale_names = [None] * len(self.weight_names)
 
-        quanted_weight_names = []
-        weight_scale_names = []
-        weight_zero_point_names = []
-
-        for weight_name in self.weight_names:
+        for sub_child_index, weight_name in enumerate(self.weight_names):
             if self.quant_method.weight_scale_suffix is not None:
                 weight_scale_name = weight_name.replace("weight", self.quant_method.weight_scale_suffix)
-                weight_scale_names.append(weight_scale_name)
+                self.weight_scale_names[sub_child_index] = weight_scale_name
             if self.quant_method.weight_zero_point_suffix is not None:
                 weight_zero_point_name = weight_name.replace("weight", self.quant_method.weight_zero_point_suffix)
-                weight_zero_point_names.append(weight_zero_point_name)
+                self.weight_zero_point_names[sub_child_index] = weight_zero_point_name
             if self.quant_method.weight_suffix is not None:
                 weight_name = weight_name.replace("weight", self.quant_method.weight_suffix)
-                quanted_weight_names.append(weight_name)
-
-        if len(quanted_weight_names) != 0:
-            self.quanted_weight_names = quanted_weight_names
-
-        if len(weight_scale_names) != 0:
-            self.weight_scale_names = weight_scale_names
-        else:
-            self.weight_scale_names = None
-
-        if len(weight_zero_point_names) != 0:
-            self.weight_zero_point_names = weight_zero_point_names
-        else:
-            self.weight_zero_point_names = None
+                self.quanted_weight_names[sub_child_index] = weight_name
         return
 
     def load_hf_weights(self, weights):

From 89cd6db8b54b31bd8132a480ed7614a9d05f073c Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 08:30:23 +0000
Subject: [PATCH 52/65] fix: clamp int8 weight quantization to the symmetric range [-127, 127]

---
 lightllm/common/quantization/w8a8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/common/quantization/w8a8.py b/lightllm/common/quantization/w8a8.py
index db79b4e31..9d80e8ab2 100644
--- a/lightllm/common/quantization/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -72,7 +72,7 @@ def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
         weight = weight.float().cuda(self.device_id_)
         scale = weight.abs().max(dim=-1)[0] / 127
         weight = weight / scale.reshape(-1, 1)
-        weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
+        weight = torch.round(weight.clamp(min=-127, max=127)).to(dtype=torch.int8)
         output.weight.copy_(weight)
         output.weight_scale.copy_(scale)
         return

From 9683167efc1ffcc3bab930b8174fc2afe831f2cd Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 27 Jan 2026 09:01:55 +0000
Subject: [PATCH 53/65] fix tpsp ep

---
 .../meta_weights/fused_moe/fused_moe_weight.py   | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index 926ea30a6..49b8ec8f6 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -46,12 +46,6 @@ def __init__(
         self.hidden_size = hidden_size
         self.moe_intermediate_size = moe_intermediate_size
         self.quant_method = quant_method
-        self.row_slicer = get_row_slice_mixin(
-            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
-        )
-        self.col_slicer = get_col_slice_mixin(
-            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
-        )
         assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
         self.enable_ep_moe = get_env_start_args().enable_ep_moe
         self.n_routed_experts = n_routed_experts
@@ -91,6 +85,15 @@ def _init_redundancy_expert_params(self):
         assert self.redundancy_expert_num != 1, "redundancy_expert_num can not be 1 for some unknown hang of deepep."
 
     def _init_parallel_params(self):
+        if self.enable_ep_moe:
+            self.tp_rank_ = 0
+            self.tp_world_size_ = 1
+        self.row_slicer = get_row_slice_mixin(
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
+        )
+        self.col_slicer = get_col_slice_mixin(
+            self.quant_method.method_name, tp_rank=self.tp_rank_, tp_world_size=self.tp_world_size_
+        )
         self.local_n_routed_experts = self.n_routed_experts + self.num_fused_shared_experts
         self.split_inter_size = self.moe_intermediate_size // self.tp_world_size_
         if self.enable_ep_moe:
@@ -100,7 +103,6 @@ def _init_parallel_params(self):
                 f"redundancy_expertids: {self.redundancy_expert_ids}"
             )
             self.local_n_routed_experts = self.n_routed_experts // self.global_world_size + self.redundancy_expert_num
-            self.split_inter_size = self.moe_intermediate_size
             n_experts_per_rank = self.n_routed_experts // self.global_world_size
             start_expert_id = self.global_rank_ * n_experts_per_rank
             self.local_expert_ids = (

From 22f0d2ff60420791322e82cb198d0ac0befa6d75 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 09:14:25 +0000
Subject: [PATCH 54/65] fix: add verify_load to gpt_oss_fused_moe_weight_tp.py

---
 .../meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py     | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
index b835cffbd..6ed0cef0b 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/gpt_oss_fused_moe_weight_tp.py
@@ -125,6 +125,10 @@ def load_hf_weights(self, weights):
             w2_bias = weights[self._down_bias_name]
             self.w2_bias = self._cuda(w2_bias)
 
+    def verify_load(self):
+        assert self.w1 is not None and self.w2 is not None
+        return True
+
     def _router(self, router_logits, top_k):
         router_top_value, router_indices = torch.topk(router_logits, top_k, dim=-1)
         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)

From 4fd670110c2396d25cbba32c485ae6224fb8eb52 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 27 Jan 2026 09:20:12 +0000
Subject: [PATCH 55/65] fix mtp

---
 .../layer_weights/pre_and_post_layer_weight.py     |  9 ++++++---
 lightllm/models/deepseek_mtp/model.py              | 13 ++++++++++++-
 .../layer_weights/pre_and_post_layer_weight.py     |  9 ++++++---
 lightllm/models/mistral_mtp/model.py               | 14 ++++++++++++--
 .../layer_weights/pre_and_post_layer_weight.py     |  9 ++++++---
 lightllm/models/qwen3_moe_mtp/model.py             | 13 ++++++++++++-
 6 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
index 1df695df0..91c0b2b3f 100644
--- a/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/deepseek_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -5,17 +5,20 @@
     RMSNormWeight,
     ROWMMWeight,
 )
+from lightllm.common.quantization import Quantcfg
 
 
 class Deepseek3MTPPreAndPostLayerWeight(PreAndPostLayerWeight):
-    def __init__(self, data_type, network_config):
+    def __init__(self, data_type, network_config, quant_cfg: Quantcfg):
         super().__init__(data_type, network_config)
-
+        self.quant_cfg: Quantcfg = quant_cfg
         hidden_size = network_config["hidden_size"]
         self.eh_proj_weight_ = ROWMMWeight(
+            in_dim=hidden_size * 2,
+            out_dims=[hidden_size],
             weight_names="model.layers.0.eh_proj.weight",
             data_type=self.data_type_,
-            name="eh_proj",
+            quant_method=self.quant_cfg.get_quant_method(0, "eh_proj"),
             tp_rank=0,
             tp_world_size=1,
         )
diff --git a/lightllm/models/deepseek_mtp/model.py b/lightllm/models/deepseek_mtp/model.py
index 0204e292a..d9ffdb0e3 100644
--- a/lightllm/models/deepseek_mtp/model.py
+++ b/lightllm/models/deepseek_mtp/model.py
@@ -35,7 +35,18 @@ def _init_mem_manager(self):
 
     def _init_weights(self, start_layer_index=None):
         assert start_layer_index is None
-        super()._init_weights(start_layer_index=0)
+        self.pre_post_weight = self.pre_and_post_weight_class(
+            self.data_type, network_config=self.config, quant_cfg=self.quant_cfg
+        )
+        self.trans_layers_weight = [
+            self.transformer_weight_class(
+                i,
+                self.data_type,
+                network_config=self.config,
+                quant_cfg=self.quant_cfg,
+            )
+            for i in range(0, self.config["n_layer"])
+        ]
         self.pre_post_weight.wte_weight_ = self.main_model.pre_post_weight.wte_weight_
         self.pre_post_weight.lm_head_weight_ = self.main_model.pre_post_weight.lm_head_weight_
         return
diff --git a/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
index 0b6dcf137..5ec5bf7c1 100644
--- a/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/mistral_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -5,17 +5,20 @@
     RMSNormWeight,
     ROWMMWeight,
 )
+from lightllm.common.quantization import Quantcfg
 
 
 class MistralMTPPreAndPostLayerWeight(PreAndPostLayerWeight):
-    def __init__(self, data_type, network_config):
+    def __init__(self, data_type, network_config, quant_cfg: Quantcfg):
         super().__init__(data_type, network_config)
+        self.quant_cfg: Quantcfg = quant_cfg
         hidden_size = network_config["hidden_size"]
         self.eh_proj_weight_ = ROWMMWeight(
+            in_dim=hidden_size * 2,
+            out_dims=[hidden_size],
             weight_names="mtp.eh_proj.weight",
             data_type=self.data_type_,
-            layer_num=0,
-            name="eh_proj",
+            quant_method=self.quant_cfg.get_quant_method(0, "eh_proj"),
             tp_rank=0,
             tp_world_size=1,
         )
diff --git a/lightllm/models/mistral_mtp/model.py b/lightllm/models/mistral_mtp/model.py
index 0132db80f..7c64625ca 100644
--- a/lightllm/models/mistral_mtp/model.py
+++ b/lightllm/models/mistral_mtp/model.py
@@ -48,9 +48,19 @@ def _init_mem_manager(self):
 
     def _init_weights(self, start_layer_index=None):
         assert start_layer_index is None
-
         self.config["n_layer"] = 1
-        super()._init_weights(start_layer_index=0)
+        self.pre_post_weight = self.pre_and_post_weight_class(
+            self.data_type, network_config=self.config, quant_cfg=self.quant_cfg
+        )
+        self.trans_layers_weight = [
+            self.transformer_weight_class(
+                i,
+                self.data_type,
+                network_config=self.config,
+                quant_cfg=self.quant_cfg,
+            )
+            for i in range(0, self.config["n_layer"])
+        ]
         self.pre_post_weight.wte_weight_ = self.main_model.pre_post_weight.wte_weight_
         self.pre_post_weight.lm_head_weight_ = self.main_model.pre_post_weight.lm_head_weight_
         self.pre_post_weight.final_norm_weight_ = self.main_model.pre_post_weight.final_norm_weight_
diff --git a/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
index 924f01c46..3038a4d07 100644
--- a/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/qwen3_moe_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -6,17 +6,20 @@
     LMHeadWeight,
     RMSNormWeight,
 )
+from lightllm.common.quantization import Quantcfg
 
 
 class Qwen3MOEMTPPreAndPostLayerWeight(PreAndPostLayerWeight):
-    def __init__(self, data_type, network_config):
+    def __init__(self, data_type, network_config, quant_cfg: Quantcfg):
         super().__init__(data_type, network_config)
-
+        self.quant_cfg: Quantcfg = quant_cfg
         hidden_size = network_config["hidden_size"]
         self.eh_proj_weight_ = ROWMMWeight(
+            in_dim=hidden_size * 2,
+            out_dims=[hidden_size],
             weight_names="model.layers.0.proj.weight",
+            quant_method=self.quant_cfg.get_quant_method(0, "eh_proj"),
             data_type=self.data_type_,
-            name="eh_proj",
             tp_rank=0,
             tp_world_size=1,
         )
diff --git a/lightllm/models/qwen3_moe_mtp/model.py b/lightllm/models/qwen3_moe_mtp/model.py
index 72aadbda8..9f83832a7 100644
--- a/lightllm/models/qwen3_moe_mtp/model.py
+++ b/lightllm/models/qwen3_moe_mtp/model.py
@@ -41,7 +41,18 @@ def _init_mem_manager(self):
     def _init_weights(self, start_layer_index=None):
         assert start_layer_index is None
         mtp_index = len(self.mtp_previous_draft_models)
-        super()._init_weights(start_layer_index=mtp_index)
+        self.pre_post_weight = self.pre_and_post_weight_class(
+            self.data_type, network_config=self.config, quant_cfg=self.quant_cfg
+        )
+        self.trans_layers_weight = [
+            self.transformer_weight_class(
+                i,
+                self.data_type,
+                network_config=self.config,
+                quant_cfg=self.quant_cfg,
+            )
+            for i in range(mtp_index, mtp_index + self.config["n_layer"])
+        ]
         self.pre_post_weight.wte_weight_ = self.main_model.pre_post_weight.wte_weight_
         self.pre_post_weight.lm_head_weight_ = self.main_model.pre_post_weight.lm_head_weight_
         self.pre_post_weight.final_norm_weight_ = self.main_model.pre_post_weight.final_norm_weight_

From a2dd0729f5a1499d2a2a43cd78eb466f97a6eb90 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 10:56:26 +0000
Subject: [PATCH 56/65] fix redundancy

---
 .../meta_weights/fused_moe/ep_redundancy.py   | 84 +++++++++++--------
 1 file changed, 47 insertions(+), 37 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/ep_redundancy.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/ep_redundancy.py
index a31cd1880..749400c8d 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/ep_redundancy.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/ep_redundancy.py
@@ -25,12 +25,13 @@ def prepare_redundancy_experts(
     ):
         expert_counter = self._ep_w.routed_expert_counter_tensor.detach().cpu().numpy()
         logger.info(
-            f"layer_index {self._ep_w.layer_num} global_rank {self._ep_w.global_rank_} expert_counter: {expert_counter}"
+            f"layer_index {self._ep_w.layer_num_} global_rank {self._ep_w.global_rank_}"
+            f" expert_counter: {expert_counter}"
         )
         self._ep_w.routed_expert_counter_tensor.fill_(0)
-
-        start_expert_id = self._ep_w.ep_n_routed_experts * self._ep_w.global_rank_
-        no_redundancy_expert_ids = list(range(start_expert_id, start_expert_id + self._ep_w.ep_n_routed_experts))
+        ep_n_routed_experts = self._ep_w.n_routed_experts // self._ep_w.global_world_size
+        start_expert_id = ep_n_routed_experts * self._ep_w.global_rank_
+        no_redundancy_expert_ids = list(range(start_expert_id, start_expert_id + ep_n_routed_experts))
 
         # 统计 0 rank 上的全局 topk 冗余信息，帮助导出一份全局可用的静态使用的冗余专家静态配置。
         if self._ep_w.global_rank_ == 0:
@@ -44,7 +45,7 @@ def prepare_redundancy_experts(
 
         self.redundancy_expert_ids = list(np.argsort(expert_counter)[-self.redundancy_expert_num :])
         logger.info(
-            f"layer_index {self._ep_w.layer_num} global_rank {self._ep_w.global_rank_}"
+            f"layer_index {self._ep_w.layer_num_} global_rank {self._ep_w.global_rank_}"
             f" new select redundancy_expert_ids : {self.redundancy_expert_ids}"
         )
 
@@ -55,7 +56,7 @@ def prepare_redundancy_experts(
         self.experts_gate_proj_scales = [None] * self.redundancy_expert_num
         self.w2_list = [None] * self.redundancy_expert_num
         self.w2_scale_list = [None] * self.redundancy_expert_num
-        self.w1 = [None, None]  # weight, weight_scale
+        self.w13 = [None, None]  # weight, weight_scale
         self.w2 = [None, None]  # weight, weight_scale
         return topk_redundancy_expert_ids
 
@@ -73,13 +74,12 @@ def load_hf_weights(self, weights):
             if w2_weight in weights:
                 self.w2_list[i] = weights[w2_weight]
 
-        if self._ep_w.quantized_weight:
-            self._load_weight_scale(weights)
+        self._load_weight_scale(weights)
         self._fuse()
 
     def _fuse(self):
-        if self._ep_w.quantized_weight:
-            self._fuse_weight_scale()
+        self._fuse_weight_scale()
+
         with self._ep_w.lock:
             if (
                 hasattr(self, "experts_up_projs")
@@ -93,23 +93,38 @@ def _fuse(self):
                 dtype = self.experts_gate_projs[0].dtype
                 total_expert_num = self.redundancy_expert_num
 
-                w1 = torch.empty((total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu")
+                w13 = torch.empty((total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu")
 
                 for i_experts in range(self.redundancy_expert_num):
-                    w1[i_experts, 0:gate_out_dim:, :] = self.experts_gate_projs[i_experts]
-                    w1[i_experts, gate_out_dim:, :] = self.experts_up_projs[i_experts]
+                    w13[i_experts, 0:gate_out_dim:, :] = self.experts_gate_projs[i_experts]
+                    w13[i_experts, gate_out_dim:, :] = self.experts_up_projs[i_experts]
 
                 inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
                 w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
-                if not self._ep_w.quantized_weight and self._ep_w.quant_method is not None:
-                    qw1_pack = self._ep_w.quant_method.quantize(w1)
-                    qw2_pack = self._ep_w.quant_method.quantize(w2)
-                    self.w1[0] = qw1_pack.weight
-                    self.w1[1] = qw1_pack.weight_scale
-                    self.w2[0] = qw2_pack.weight
-                    self.w2[1] = qw2_pack.weight_scale
+                if self._ep_w.quant_method._check_weight_need_quanted(weight=w13):
+                    w13_pack, _ = self._ep_w.quant_method.create_moe_weight(
+                        out_dims=[gate_out_dim + up_out_dim],
+                        in_dim=1,
+                        dtype=self._ep_w.data_type_,
+                        device_id=self._ep_w.device_id_,
+                        num_experts=self.redundancy_expert_num,
+                    )
+                    self._ep_w.quant_method.quantize(w13, w13_pack)
+                    w2_pack, _ = self._ep_w.quant_method.create_moe_weight(
+                        out_dims=[inter_shape],
+                        in_dim=hidden_size,
+                        dtype=self._ep_w.data_type_,
+                        device_id=self._ep_w.device_id_,
+                        num_experts=self.redundancy_expert_num,
+                    )
+                    self._ep_w.quant_method.quantize(w2, w2_pack)
+
+                    self.w13[0] = w13_pack.weight
+                    self.w13[1] = w13_pack.weight_scale
+                    self.w2[0] = w2_pack.weight
+                    self.w2[1] = w2_pack.weight_scale
                 else:
-                    self.w1[0] = w1
+                    self.w13[0] = w13
                     self.w2[0] = w2
                 delattr(self, "w2_list")
                 delattr(self, "experts_up_projs")
@@ -128,18 +143,18 @@ def _fuse_weight_scale(self):
                 assert gate_in_dim == up_in_dim
                 dtype = self.experts_gate_proj_scales[0].dtype
                 total_expert_num = self.redundancy_expert_num
-                w1_scale = torch.empty(
+                w13_scale = torch.empty(
                     (total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu"
                 )
                 for i_experts in range(self.redundancy_expert_num):
-                    w1_scale[i_experts, 0:gate_out_dim:, :] = self.experts_gate_proj_scales[i_experts]
-                    w1_scale[i_experts, gate_out_dim:, :] = self.experts_up_proj_scales[i_experts]
+                    w13_scale[i_experts, 0:gate_out_dim:, :] = self.experts_gate_proj_scales[i_experts]
+                    w13_scale[i_experts, gate_out_dim:, :] = self.experts_up_proj_scales[i_experts]
 
                 inter_shape, hidden_size = self.w2_scale_list[0].shape[0], self.w2_scale_list[0].shape[1]
                 w2_scale = torch._utils._flatten_dense_tensors(self.w2_scale_list).view(
                     len(self.w2_scale_list), inter_shape, hidden_size
                 )
-                self.w1[1] = w1_scale
+                self.w13[1] = w13_scale
                 self.w2[1] = w2_scale
                 delattr(self, "w2_scale_list")
                 delattr(self, "experts_up_proj_scales")
@@ -149,15 +164,10 @@ def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
         # 加载冗余专家的scale参数
         for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
             i_experts = redundant_expert_id
-            w1_scale = (
-                f"{self._ep_w.weight_prefix}.{i_experts}.{self._ep_w.w1_weight_name}.{self._ep_w.weight_scale_suffix}"
-            )
-            w2_scale = (
-                f"{self._ep_w.weight_prefix}.{i_experts}.{self._ep_w.w2_weight_name}.{self._ep_w.weight_scale_suffix}"
-            )
-            w3_scale = (
-                f"{self._ep_w.weight_prefix}.{i_experts}.{self._ep_w.w3_weight_name}.{self._ep_w.weight_scale_suffix}"
-            )
+            weight_scale_suffix = self._ep_w.quant_method.weight_scale_suffix
+            w1_scale = f"{self._ep_w.weight_prefix}.{i_experts}.{self._ep_w.w1_weight_name}.{weight_scale_suffix}"
+            w2_scale = f"{self._ep_w.weight_prefix}.{i_experts}.{self._ep_w.w2_weight_name}.{weight_scale_suffix}"
+            w3_scale = f"{self._ep_w.weight_prefix}.{i_experts}.{self._ep_w.w3_weight_name}.{weight_scale_suffix}"
             if w1_scale in weights:
                 self.experts_gate_proj_scales[i] = weights[w1_scale]
             if w3_scale in weights:
@@ -166,14 +176,14 @@ def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
                 self.w2_scale_list[i] = weights[w2_scale]
 
     def commit(self):
-        for index, dest_tensor in enumerate(self._ep_w.w1):
+        for index, dest_tensor in enumerate([self._ep_w.w13.weight, self._ep_w.w13.weight_scale]):
             if dest_tensor is not None:
                 assert isinstance(
                     dest_tensor, torch.Tensor
                 ), f"dest_tensor should be a torch.Tensor, but got {type(dest_tensor)}"
-                dest_tensor[-self.redundancy_expert_num :, :, :] = self.w1[index][:, :, :]
+                dest_tensor[-self.redundancy_expert_num :, :, :] = self.w13[index][:, :, :]
 
-        for index, dest_tensor in enumerate(self._ep_w.w2):
+        for index, dest_tensor in enumerate([self._ep_w.w2.weight, self._ep_w.w2.weight_scale]):
             if dest_tensor is not None:
                 assert isinstance(
                     dest_tensor, torch.Tensor

From 5d09cea1d26828bcaa8be3c5e1b2395fc9c55bdd Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 11:04:40 +0000
Subject: [PATCH 57/65] fix: annotate fused MoE experts/__call__ return type as torch.Tensor

---
 .../layer_weights/meta_weights/fused_moe/fused_moe_weight.py    | 2 +-
 .../layer_weights/meta_weights/fused_moe/impl/base_impl.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
index 49b8ec8f6..6bcf7fc03 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -130,7 +130,7 @@ def experts(
         topk_group: int,
         num_expert_group: int,
         is_prefill: Optional[bool] = None,
-    ):
+    ) -> torch.Tensor:
         """Backward compatible method that routes to platform-specific implementation."""
         return self.fuse_moe_impl(
             input_tensor=input_tensor,
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
index c56cd4da3..00587ac18 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/base_impl.py
@@ -62,5 +62,5 @@ def __call__(
         topk_group: int,
         num_expert_group: int,
         is_prefill: Optional[bool] = None,
-    ):
+    ) -> torch.Tensor:
         pass

From 2b46964148d7a25194b586f9a7957e3fe32fcc16 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 11:06:55 +0000
Subject: [PATCH 58/65] fix: add AWQMARLINW4A16QuantizationMethod type hint for quant_method in marlin_impl

---
 .../layer_weights/meta_weights/fused_moe/impl/marlin_impl.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py
index bdccbc0ee..6391a1080 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/marlin_impl.py
@@ -3,6 +3,9 @@
 from lightllm.common.quantization.quantize_method import (
     WeightPack,
 )
+from lightllm.common.quantization.awq import (
+    AWQMARLINW4A16QuantizationMethod,
+)
 from typing import Optional
 
 
@@ -33,6 +36,8 @@ def _fused_experts(
 
         from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 
+        self.quant_method: AWQMARLINW4A16QuantizationMethod = self.quant_method
+
         fused_marlin_moe(
             input_tensor,
             w1_weight,

From e3487759ed510b59026094969a27222da9468f1d Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 11:27:45 +0000
Subject: [PATCH 59/65] fix: drop offset param from AWQ quantize signatures

---
 lightllm/common/quantization/awq.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lightllm/common/quantization/awq.py b/lightllm/common/quantization/awq.py
index 41a4e7685..f3c762397 100644
--- a/lightllm/common/quantization/awq.py
+++ b/lightllm/common/quantization/awq.py
@@ -47,7 +47,7 @@ def __init__(self):
 
         self.cache_manager = g_cache_manager
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+    def quantize(self, weight: torch.Tensor, output: WeightPack):
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
     def apply(
@@ -81,7 +81,7 @@ def __init__(self):
     def method_name(self):
         return "awq"
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+    def quantize(self, weight: torch.Tensor, output: WeightPack):
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
     def apply(
@@ -156,7 +156,7 @@ def __init__(self):
     def method_name(self):
         return "awq_marlin"
 
-    def quantize(self, weight: torch.Tensor, offset: int = 0) -> WeightPack:
+    def quantize(self, weight: torch.Tensor, output: WeightPack):
         raise NotImplementedError("AWQ online quantization is not supported yet.")
 
     def apply(

From 3c1f80f7e78e70bc827482a4061a363d41d7c5e0 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 11:30:57 +0000
Subject: [PATCH 60/65] fix: drop offset param from deepgemm quantize, copy full weight and scale

---
 lightllm/common/quantization/deepgemm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index 88877279c..be31f1efa 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -24,7 +24,7 @@ def __init__(self):
         self.cache_manager = g_cache_manager
         assert HAS_DEEPGEMM, "deepgemm is not installed, you can't use quant api of it"
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+    def quantize(self, weight: torch.Tensor, output: WeightPack):
         raise NotImplementedError("Not implemented")
 
     def apply(
@@ -58,13 +58,13 @@ def __init__(self):
     def method_name(self):
         return "deepgemm-fp8w8a8-b128"
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0):
+    def quantize(self, weight: torch.Tensor, output: WeightPack):
         from lightllm.common.basemodel.triton_kernel.quantization.fp8w8a8_block_quant_kernel import weight_quant
 
         device = output.weight.device
         weight, scale = weight_quant(weight.cuda(device), self.block_size)
-        output.weight[offset : offset + weight.shape[0], :].copy_(weight)
-        output.weight_scale[offset // self.block_size : offset + weight.shape[0] // self.block_size].copy_(scale)
+        output.weight.copy_(weight)
+        output.weight_scale.copy_(scale)
         return
 
     def apply(

From 7dfb650cc3e344b31505c66030df3cd77c4001ab Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 11:35:42 +0000
Subject: [PATCH 61/65] fix: drop offset param from w8a8 quantize signature

---
 lightllm/common/quantization/w8a8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/common/quantization/w8a8.py b/lightllm/common/quantization/w8a8.py
index 9d80e8ab2..98626e1d3 100644
--- a/lightllm/common/quantization/w8a8.py
+++ b/lightllm/common/quantization/w8a8.py
@@ -37,7 +37,7 @@ def __init__(self):
 
         self.cache_manager = g_cache_manager
 
-    def quantize(self, weight: torch.Tensor, output: WeightPack, offset: int = 0) -> None:
+    def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
         raise NotImplementedError("Not implemented")
 
     def apply(

From 739c479f632a3a04b17437e3a5d0fd29a0c93f75 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 11:38:33 +0000
Subject: [PATCH 62/65] fix bloom.

---
 .../bloom/layer_weights/pre_and_post_layer_weight.py   | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
index 000a06912..2a9c86b8f 100644
--- a/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
+++ b/lightllm/models/bloom/layer_weights/pre_and_post_layer_weight.py
@@ -1,5 +1,5 @@
 from lightllm.common.basemodel import PreAndPostLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LayerNormWeight
+from lightllm.common.basemodel.layer_weights.meta_weights import EmbeddingWeight, LayerNormWeight, LMHeadWeight
 
 
 class BloomPreAndPostLayerWeight(PreAndPostLayerWeight):
@@ -26,4 +26,10 @@ def __init__(self, data_type, network_config):
             weight_name="word_embeddings.weight",
             data_type=self.data_type_,
         )
-        self.lm_head_weight_ = self.wte_weight_
+        self.lm_head_weight_ = LMHeadWeight(
+            dim=hidden_size,
+            vocab_size=vocab_size,
+            weight_name="word_embeddings.weight",
+            data_type=self.data_type_,
+            embedding_weight=self.wte_weight_,
+        )

From adbe97c1ceb27065443d7a30ac8a067a58861e08 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 27 Jan 2026 11:45:44 +0000
Subject: [PATCH 63/65] fix qwen3 235b online quant

---
 lightllm/common/quantization/deepgemm.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index be31f1efa..878815edb 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -101,13 +101,15 @@ def _create_weight(
         self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
     ) -> Tuple[WeightPack, List[WeightPack]]:
         out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
+        weight_scale_out_dims = [(_out_dim + self.block_size - 1) // self.block_size for _out_dim in out_dims]
+        weight_scale_out_dim = sum(weight_scale_out_dims)
+        weight_scale_in_dim = (in_dim + self.block_size - 1) // self.block_size
         expert_prefix = (num_experts,) if num_experts > 1 else ()
         weight = torch.empty(expert_prefix + (out_dim, in_dim), dtype=torch.float8_e4m3fn).cuda(device_id)
-        scale_out_dim = (out_dim + self.block_size - 1) // self.block_size
-        scale_in_dim = (in_dim + self.block_size - 1) // self.block_size
-        weight_scale = torch.empty(expert_prefix + (scale_out_dim, scale_in_dim), dtype=torch.float32).cuda(device_id)
+        weight_scale = torch.empty(
+            expert_prefix + (weight_scale_out_dim, weight_scale_in_dim), dtype=torch.float32
+        ).cuda(device_id)
         mm_param = WeightPack(weight=weight, weight_scale=weight_scale)
-        weight_scale_out_dims = [(_out_dim + self.block_size - 1) // self.block_size for _out_dim in out_dims]
         mm_param_list = self._split_weight_pack(
             mm_param,
             weight_out_dims=out_dims,

From 577e08f015a0a0196dfbee52defd7884707f30f4 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Tue, 27 Jan 2026 12:04:53 +0000
Subject: [PATCH 64/65] add out_dims block_size divisibility check (ValueError) for qwen3 235b tp=8, deepgemm

---
 lightllm/common/quantization/deepgemm.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
index 878815edb..137455a82 100644
--- a/lightllm/common/quantization/deepgemm.py
+++ b/lightllm/common/quantization/deepgemm.py
@@ -102,6 +102,12 @@ def _create_weight(
     ) -> Tuple[WeightPack, List[WeightPack]]:
         out_dim = sum(out_dims) if isinstance(out_dims, list) else out_dims
         weight_scale_out_dims = [(_out_dim + self.block_size - 1) // self.block_size for _out_dim in out_dims]
+        divisible_by_block_size = [_out_dim % self.block_size != 0 for _out_dim in out_dims]
+        if sum(divisible_by_block_size) > 1:
+            raise ValueError(
+                f"out_dims only contains one dim can not be divisible \
+                by block_size {self.block_size}, but got {out_dims}"
+            )
         weight_scale_out_dim = sum(weight_scale_out_dims)
         weight_scale_in_dim = (in_dim + self.block_size - 1) // self.block_size
         expert_prefix = (num_experts,) if num_experts > 1 else ()

From 795ab5643a034c82de9cf7e935b60e222a673538 Mon Sep 17 00:00:00 2001
From: wangzaijun <wangzaijun@sensetime.com>
Date: Tue, 27 Jan 2026 12:05:47 +0000
Subject: [PATCH 65/65] fix: remove tp vocab id range setup from Gemma_2bPreLayerInfer.__init__

---
 lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py b/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py
index e21788d76..a25c5af4c 100644
--- a/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py
+++ b/lightllm/models/gemma_2b/layer_infer/pre_layer_infer.py
@@ -13,8 +13,6 @@ class Gemma_2bPreLayerInfer(PreLayerInferTpl):
 
     def __init__(self, network_config):
         super().__init__(network_config)
-        tp_vob_ids = np.linspace(0, network_config["vocab_size"], self.tp_world_size_ + 1, dtype=np.int64)
-        self.vob_start_id_, self.vob_end_id_ = int(tp_vob_ids[self.tp_rank_]), int(tp_vob_ids[self.tp_rank_ + 1])
         self.normfactor = network_config["hidden_size"] ** 0.5
         return