Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 35 additions & 41 deletions auto_round/inference/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ class BackendInfo:
Attributes:
device: A list of strings representing the devices the backend supports
(e.g., 'cuda', 'cpu').
(e.g., 'cpu', 'xpu', 'cuda').
sym: A list of booleans indicating whether the backend supports symmetric
quantization for weights (True if symmetric, False if not).
packing_format: A list of strings representing the packing formats used by the backend
(e.g., 'triton', 'qbits').
(e.g., 'ark', 'triton').
bits: A list of integers specifying the bit-widths supported by the backend
for weight quantization (e.g., [2, 4, 8]).
group_size: An optional list of integers specifying the group sizes supported
Expand Down Expand Up @@ -430,51 +430,51 @@ def fp8_static_scheme_checker(
requirements=["autoawq", "transformers<4.57.0"],
)

BackendInfos["qbits"] = BackendInfo(
device=["cpu"],
sym=[True, False],
BackendInfos["auto_round_kernel"] = BackendInfo(
device=["cpu", "xpu"],
sym=[True],
packing_format=GPTQ_FORMAT_NO_ZP,
bits=[2, 4, 8],
group_size=None,
priority=1,
priority=0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please set `priority` to 6 for all of the ark (`auto_round_kernel*`) backends.

checkers=[],
alias=["itrex", "qbits"],
compute_dtype=["float16", "bfloat16"],
alias=["ark"],
compute_dtype=["float32", "float16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
requirements=["torch<2.7.0", "intel-extension-for-transformers"],
requirements=["torch>=2.9.0"],
)

BackendInfos["qbits_zp"] = BackendInfo(
device=["cpu"],
sym=[True, False],
BackendInfos["auto_round_kernel_zp"] = BackendInfo(
device=["cpu", "xpu"],
sym=[True],
packing_format=GPTQ_FORMAT,
bits=[2, 4, 8],
group_size=None,
compute_dtype=["float16", "bfloat16"],
priority=0,
checkers=[],
alias=["ark"],
compute_dtype=["float32", "float16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
priority=1,
checkers=[],
alias=["itrex", "qbits"],
requirements=["torch<2.7.0", "intel-extension-for-transformers"],
requirements=["torch>=2.9.0"],
)


BackendInfos["qbits_awq"] = BackendInfo(
BackendInfos["auto_round_kernel_awq"] = BackendInfo(
device=["cpu"],
sym=[True, False],
packing_format=AWQ_FORMAT,
bits=[2, 4, 8],
group_size=None,
compute_dtype=["float16", "bfloat16"],
priority=0,
checkers=[],
alias=["ark"],
compute_dtype=["float32", "float16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
priority=1,
checkers=[],
alias=["itrex", "qbits"],
requirements=["torch<2.7.0", "intel-extension-for-transformers"],
requirements=["torch>=2.9.0"],
)

BackendInfos["ipex_gptq"] = BackendInfo(
device=["cpu", "xpu"],
sym=[True, False],
Expand Down Expand Up @@ -601,12 +601,12 @@ def dynamic_import_inference_linear(backend, config):
"""Dynamically imports and returns the appropriate QuantLinear class based on the given backend.
This function dynamically loads the correct `QuantLinear` class based on the backend and quantization
configuration (e.g., qbits, marlin, hpu, gptq, awq, auto_round). It imports specific modules or raises
configuration (e.g., ark, marlin, hpu, gptq, awq). It imports specific modules or raises
errors if the required packages are not installed or the environment is not set up.
Args:
backend (str):
The backend to be used for quantization (e.g., 'qbits', 'marlin', 'hpu', 'gptq', 'awq', 'auto_round').
The backend to be used for quantization (e.g., 'ark', 'marlin', 'hpu', 'gptq', 'awq').
config (QuantizationScheme):
The quantization configuration containing parameters like bits, group_size, and sym.
Expand All @@ -616,7 +616,7 @@ def dynamic_import_inference_linear(backend, config):
Raises:
ImportError:
If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq).
If required modules are missing for a backend (e.g., ark, GPTQ, auto_awq).
"""
bits, group_size, sym = config["bits"], config["group_size"], config["sym"]

Expand All @@ -629,26 +629,20 @@ def dynamic_import_inference_linear(backend, config):
if "torch_nvfp4" in backend:
return ar_qmodules.NVFP4QuantLinear

if "qbits" in backend:
if "auto_round_kernel" in backend or "ark" in backend:
try:
from intel_extension_for_transformers import qbits # pylint: disable=E0401
import auto_round_kernel as ark # pylint: disable=E0401
except Exception as e:
raise ImportError(
"Please install Intel Extension for Transformers via 'pip install "
"intel-extension-for-transformers' to inference on X86 CPU"
)
if "zp" in backend:
import auto_round_extension.qbits.qlinear_qbits_gptq as qlinear_qbits_gptq
raise ImportError("Please install auto_round_kernel version for CPU/XPU")
import auto_round_extension.kernel.qlinear as qlinear

return qlinear_qbits_gptq.QuantLinear
if "zp" in backend:
return qlinear.QuantLinearGPTQ
elif "awq" in backend:
import auto_round_extension.qbits.qbits_awq as qlinear_qbits_awq

return qlinear_qbits_awq.QuantLinear
return qlinear.QuantLinearAWQ
else: # auto_round must be at the end
import auto_round_extension.qbits.qlinear_qbits as qlinear_qbits_autoround
return qlinear.QuantLinear

return qlinear_qbits_autoround.QuantLinear
if "ipex_gptq" in backend:
from auto_round_extension.ipex.qlinear_ipex_gptq import QuantLinear

Expand Down
13 changes: 4 additions & 9 deletions auto_round/inference/convert_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from auto_round.inference.utils import _expand_regex_config
from auto_round.logger import logger
from auto_round.schemes import QuantizationScheme
from auto_round.special_model_handler import _handle_moe_model
from auto_round.utils import (
SUPPORTED_LAYER_TYPES,
check_start_with_block_name,
Expand Down Expand Up @@ -395,9 +394,9 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features)
bias = layer.bias is not None

# Special handling for AWQ layers
from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear
from auto_round_extension.ark.qlinear import QuantLinearAWQ

if "awq" in layer_backend and isinstance(QuantLinear, QBitsAWQQuantLinear):
if "awq" in layer_backend and isinstance(QuantLinear, QuantLinearAWQ):
return QuantLinear.from_linear(
layer, config["bits"], config["group_size"], init_only=True, has_zero_points=not config["sym"]
)
Expand Down Expand Up @@ -474,7 +473,6 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
need_gptqmodel_init = False
need_ipex_itrex_init = False
used_gptq_exllamav2 = False

# Determine which backends require post-init
for backend in used_backends:
if backend.startswith("auto_gptq"):
Expand All @@ -483,7 +481,7 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
used_gptq_exllamav2 = True
elif backend.startswith("gptqmodel"):
need_gptqmodel_init = True
elif backend.startswith(("ipex", "qbit")):
elif backend.startswith(("ipex", "auto_round_kernel")):
need_ipex_itrex_init = True

# AutoGPTQ post-init
Expand All @@ -503,7 +501,7 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
message = "repacking to CPU/XPU format"
layers = [] ## ipex post_init will add one more layer
for n, m in model.named_modules():
if hasattr(m, "QUANT_TYPE") and ("qbits" in m.QUANT_TYPE or "ipex" in m.QUANT_TYPE):
if hasattr(m, "QUANT_TYPE") and ("ark" in m.QUANT_TYPE or "ipex" in m.QUANT_TYPE):
layers.append(m)

for layer in tqdm(layers, desc=message, total=len(layers), leave=True):
Expand Down Expand Up @@ -583,9 +581,6 @@ def convert_hf_model(model: nn.Module, target_device: str = "cpu") -> tuple[nn.M
elif packing_format == "auto_round:gptq":
packing_format = "auto_round:auto_gptq"

# Preprocess model before replace layers
model = _handle_moe_model(model)

# Replace layers with quantized versions
layer_configs = get_layer_config(model, quantization_config)
used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, packing_format)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from auto_round_extension.qbits.qlinear_qbits import QuantLinear as QBitsQuantLinear
from auto_round_extension.qbits.qlinear_qbits_gptq import (
QuantLinear as QBitsGPTQQuantLinear,
)
from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear
from auto_round_extension.ark.qlinear import QuantLinear, QuantLinearGPTQ, QuantLinearAWQ

qbits_qlinear_classes = (QBitsQuantLinear, QBitsGPTQQuantLinear)
qlinear_classes = (QuantLinear, QuantLinearGPTQ)

qbits_awq_classes = (QBitsAWQQuantLinear,)
awq_classes = (QuantLinearAWQ,)
Loading
Loading