intel · Zhenzhong1 · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
@@ -47,11 +47,11 @@ class BackendInfo:
 
     Attributes:
         device: A list of strings representing the devices the backend supports
-            (e.g., 'cuda', 'cpu').
+            (e.g., 'cpu', 'xpu', 'cuda').
         sym: A list of booleans indicating whether the backend supports symmetric
             quantization for weights (True if symmetric, False if not).
         packing_format: A list of strings representing the packing formats used by the backend
-            (e.g., 'triton', 'qbits').
+            (e.g., 'ark', 'triton').
         bits: A list of integers specifying the bit-widths supported by the backend
             for weight quantization (e.g., [2, 4, 8]).
         group_size: An optional list of integers specifying the group sizes supported
@@ -430,51 +430,51 @@ def fp8_static_scheme_checker(
     requirements=["autoawq", "transformers<4.57.0"],
 )
 
-BackendInfos["qbits"] = BackendInfo(
-    device=["cpu"],
-    sym=[True, False],
+BackendInfos["auto_round_kernel"] = BackendInfo(
+    device=["cpu", "xpu"],
+    sym=[True],
     packing_format=GPTQ_FORMAT_NO_ZP,
     bits=[2, 4, 8],
     group_size=None,
-    priority=1,
+    priority=0,
     checkers=[],
-    alias=["itrex", "qbits"],
-    compute_dtype=["float16", "bfloat16"],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
 )
 
-BackendInfos["qbits_zp"] = BackendInfo(
-    device=["cpu"],
-    sym=[True, False],
+BackendInfos["auto_round_kernel_zp"] = BackendInfo(
+    device=["cpu", "xpu"],
+    sym=[True],
     packing_format=GPTQ_FORMAT,
     bits=[2, 4, 8],
     group_size=None,
-    compute_dtype=["float16", "bfloat16"],
+    priority=0,
+    checkers=[],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    priority=1,
-    checkers=[],
-    alias=["itrex", "qbits"],
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
 )
 
-
-BackendInfos["qbits_awq"] = BackendInfo(
+BackendInfos["auto_round_kernel_awq"] = BackendInfo(
     device=["cpu"],
     sym=[True, False],
     packing_format=AWQ_FORMAT,
     bits=[2, 4, 8],
     group_size=None,
-    compute_dtype=["float16", "bfloat16"],
+    priority=0,
+    checkers=[],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    priority=1,
-    checkers=[],
-    alias=["itrex", "qbits"],
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
 )
+
 BackendInfos["ipex_gptq"] = BackendInfo(
     device=["cpu", "xpu"],
     sym=[True, False],
@@ -601,12 +601,12 @@ def dynamic_import_inference_linear(backend, config):
     """Dynamically imports and returns the appropriate QuantLinear class based on the given backend.
 
     This function dynamically loads the correct `QuantLinear` class based on the backend and quantization
-    configuration (e.g., qbits, marlin, hpu, gptq, awq, auto_round). It imports specific modules or raises
+    configuration (e.g., ark, marlin, hpu, gptq, awq). It imports specific modules or raises
     errors if the required packages are not installed or the environment is not set up.
 
     Args:
         backend (str):
-            The backend to be used for quantization (e.g., 'qbits', 'marlin', 'hpu', 'gptq', 'awq', 'auto_round').
+            The backend to be used for quantization (e.g., 'ark', 'marlin', 'hpu', 'gptq', 'awq').
         config (QuantizationScheme):
             The quantization configuration containing parameters like bits, group_size, and sym.
 
@@ -616,7 +616,7 @@ def dynamic_import_inference_linear(backend, config):
 
     Raises:
         ImportError:
-            If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq).
+            If required modules are missing for a backend (e.g., ark, GPTQ, auto_awq).
     """
     bits, group_size, sym = config["bits"], config["group_size"], config["sym"]
 
@@ -629,26 +629,20 @@ def dynamic_import_inference_linear(backend, config):
     if "torch_nvfp4" in backend:
         return ar_qmodules.NVFP4QuantLinear
 
-    if "qbits" in backend:
+    if "auto_round_kernel" in backend or "ark" in backend:
         try:
-            from intel_extension_for_transformers import qbits  # pylint: disable=E0401
+            import auto_round_kernel as ark  # pylint: disable=E0401
         except Exception as e:
-            raise ImportError(
-                "Please install Intel Extension for Transformers via 'pip install "
-                "intel-extension-for-transformers' to inference on X86 CPU"
-            )
-        if "zp" in backend:
-            import auto_round_extension.qbits.qlinear_qbits_gptq as qlinear_qbits_gptq
+            raise ImportError("Please install auto_round_kernel version for CPU/XPU") from e
+        import auto_round_extension.ark.qlinear as qlinear
 
-            return qlinear_qbits_gptq.QuantLinear
+        if "zp" in backend:
+            return qlinear.QuantLinearGPTQ
         elif "awq" in backend:
-            import auto_round_extension.qbits.qbits_awq as qlinear_qbits_awq
-
-            return qlinear_qbits_awq.QuantLinear
+            return qlinear.QuantLinearAWQ
         else:  # auto_round must be at the end
-            import auto_round_extension.qbits.qlinear_qbits as qlinear_qbits_autoround
+            return qlinear.QuantLinear
 
-            return qlinear_qbits_autoround.QuantLinear
     if "ipex_gptq" in backend:
         from auto_round_extension.ipex.qlinear_ipex_gptq import QuantLinear
 

diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py
@@ -31,7 +31,6 @@
 from auto_round.inference.utils import _expand_regex_config
 from auto_round.logger import logger
 from auto_round.schemes import QuantizationScheme
-from auto_round.special_model_handler import _handle_moe_model
 from auto_round.utils import (
     SUPPORTED_LAYER_TYPES,
     check_start_with_block_name,
@@ -395,9 +394,9 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features)
     bias = layer.bias is not None
 
     # Special handling for AWQ layers
-    from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear
+    from auto_round_extension.ark.qlinear import QuantLinearAWQ
 
-    if "awq" in layer_backend and isinstance(QuantLinear, QBitsAWQQuantLinear):
+    if "awq" in layer_backend and isinstance(QuantLinear, QuantLinearAWQ):
         return QuantLinear.from_linear(
             layer, config["bits"], config["group_size"], init_only=True, has_zero_points=not config["sym"]
         )
@@ -474,7 +473,6 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
     need_gptqmodel_init = False
     need_ipex_itrex_init = False
     used_gptq_exllamav2 = False
-
     # Determine which backends require post-init
     for backend in used_backends:
         if backend.startswith("auto_gptq"):
@@ -483,7 +481,7 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
                 used_gptq_exllamav2 = True
         elif backend.startswith("gptqmodel"):
             need_gptqmodel_init = True
-        elif backend.startswith(("ipex", "qbit")):
+        elif backend.startswith(("ipex", "auto_round_kernel")):
             need_ipex_itrex_init = True
 
     # AutoGPTQ post-init
@@ -503,7 +501,7 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
         message = "repacking to CPU/XPU format"
         layers = []  ## ipex post_init  will add one more layer
         for n, m in model.named_modules():
-            if hasattr(m, "QUANT_TYPE") and ("qbits" in m.QUANT_TYPE or "ipex" in m.QUANT_TYPE):
+            if hasattr(m, "QUANT_TYPE") and ("ark" in m.QUANT_TYPE or "ipex" in m.QUANT_TYPE):
                 layers.append(m)
 
         for layer in tqdm(layers, desc=message, total=len(layers), leave=True):
@@ -583,9 +581,6 @@ def convert_hf_model(model: nn.Module, target_device: str = "cpu") -> tuple[nn.M
     elif packing_format == "auto_round:gptq":
         packing_format = "auto_round:auto_gptq"
 
-    # Preprocess model before replace layers
-    model = _handle_moe_model(model)
-
     # Replace layers with quantized versions
     layer_configs = get_layer_config(model, quantization_config)
     used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, packing_format)

diff --git a/auto_round_extension/qbits/__init__.py → auto_round_extension/ark/__init__.py b/auto_round_extension/qbits/__init__.py → auto_round_extension/ark/__init__.py
@@ -12,12 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from auto_round_extension.qbits.qlinear_qbits import QuantLinear as QBitsQuantLinear
-from auto_round_extension.qbits.qlinear_qbits_gptq import (
-    QuantLinear as QBitsGPTQQuantLinear,
-)
-from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear
+from auto_round_extension.ark.qlinear import QuantLinear, QuantLinearGPTQ, QuantLinearAWQ
 
-qbits_qlinear_classes = (QBitsQuantLinear, QBitsGPTQQuantLinear)
+qlinear_classes = (QuantLinear, QuantLinearGPTQ)
 
-qbits_awq_classes = (QBitsAWQQuantLinear,)
+awq_classes = (QuantLinearAWQ,)