vllm_gaudi.ops.hpu_awq

AWQHPUConfig

Bases: QuantizationConfig

Config class for AWQ.

Reference: https://arxiv.org/abs/2306.00978

Source code in vllm_gaudi/ops/hpu_awq.py
@register_quantization_config("awq_hpu")
class AWQHPUConfig(QuantizationConfig):
    """Config class for AWQ.

    Reference: https://arxiv.org/abs/2306.00978
    """

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        zero_point: bool,
        modules_to_not_convert: Optional[list[str]] = None,
    ) -> None:
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.zero_point = zero_point
        self.modules_to_not_convert = modules_to_not_convert or []

        if self.weight_bits != 4:
            raise ValueError("Currently, only 4-bit weight quantization is supported for "
                             f"AWQ, but got {self.weight_bits} bits.")
        self.pack_factor = 32 // self.weight_bits

    def __repr__(self) -> str:
        return (f"AWQConfig(weight_bits={self.weight_bits}, "
                f"group_size={self.group_size}, "
                f"zero_point={self.zero_point}),"
                f"modules_to_not_convert={self.modules_to_not_convert})")

    def get_name(self) -> str:
        return "awq_hpu"

    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        return [torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        # The AWQ kernel only supports Turing or newer GPUs.
        return 0

    @staticmethod
    def get_config_filenames() -> list[str]:
        return [
            "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
            # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
            "quantize_config.json",
        ]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AWQHPUConfig":
        weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
        group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
        zero_point = cls.get_from_keys(config, ["zero_point"])
        modules_to_not_convert = cls.get_from_keys_or(config, ["modules_to_not_convert"], None)
        return cls(weight_bits, group_size, zero_point, modules_to_not_convert)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:

        is_valid_user_quant = user_quant == "awq_hpu"

        if is_valid_user_quant:
            instance = cls(weight_bits=4, group_size=128, zero_point=True)
            return instance.get_name()

        return None

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["AWQHPULinearMethod"]:
        LinearBase, _, UnquantizedLinearMethod = get_linear_classes()
        if isinstance(layer, LinearBase):
            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
                return UnquantizedLinearMethod()
            return AWQHPULinearMethod(self)
        return None

    def get_scaled_act_names(self) -> list[str]:
        return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]

group_size instance attribute

group_size = group_size

modules_to_not_convert instance attribute

modules_to_not_convert = modules_to_not_convert or []

pack_factor instance attribute

pack_factor = 32 // weight_bits

weight_bits instance attribute

weight_bits = weight_bits

zero_point instance attribute

zero_point = zero_point

__init__

__init__(
    weight_bits: int,
    group_size: int,
    zero_point: bool,
    modules_to_not_convert: Optional[list[str]] = None,
) -> None
Source code in vllm_gaudi/ops/hpu_awq.py
def __init__(
    self,
    weight_bits: int,
    group_size: int,
    zero_point: bool,
    modules_to_not_convert: Optional[list[str]] = None,
) -> None:
    self.weight_bits = weight_bits
    self.group_size = group_size
    self.zero_point = zero_point
    self.modules_to_not_convert = modules_to_not_convert or []

    if self.weight_bits != 4:
        raise ValueError("Currently, only 4-bit weight quantization is supported for "
                         f"AWQ, but got {self.weight_bits} bits.")
    self.pack_factor = 32 // self.weight_bits
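
The constructor derives pack_factor (how many int4 values fit in one int32 word) and rejects any weight width other than 4 bits. A minimal usage sketch, assuming vllm-gaudi is installed; the argument values are illustrative:

from vllm_gaudi.ops.hpu_awq import AWQHPUConfig

config = AWQHPUConfig(weight_bits=4, group_size=128, zero_point=True)
print(config.pack_factor)   # 8 int4 values per int32 word

# Any other weight width is rejected at construction time:
try:
    AWQHPUConfig(weight_bits=8, group_size=128, zero_point=True)
except ValueError as exc:
    print(exc)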

__repr__

__repr__() -> str
Source code in vllm_gaudi/ops/hpu_awq.py
def __repr__(self) -> str:
    return (f"AWQConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size}, "
            f"zero_point={self.zero_point}),"
            f"modules_to_not_convert={self.modules_to_not_convert})")

from_config classmethod

from_config(config: dict[str, Any]) -> AWQHPUConfig
Source code in vllm_gaudi/ops/hpu_awq.py
@classmethod
def from_config(cls, config: dict[str, Any]) -> "AWQHPUConfig":
    weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
    group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
    zero_point = cls.get_from_keys(config, ["zero_point"])
    modules_to_not_convert = cls.get_from_keys_or(config, ["modules_to_not_convert"], None)
    return cls(weight_bits, group_size, zero_point, modules_to_not_convert)
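
from_config accepts either of the key spellings found in AWQ checkpoints ("w_bit"/"q_group_size" or "bits"/"group_size"). A small sketch with an illustrative quantize_config.json-style dict (the values and module names are made up):

from vllm_gaudi.ops.hpu_awq import AWQHPUConfig

# Illustrative contents of a quantize_config.json; real checkpoints may use
# either "w_bit"/"q_group_size" or "bits"/"group_size".
quant_cfg = {
    "w_bit": 4,
    "q_group_size": 128,
    "zero_point": True,
    "modules_to_not_convert": ["lm_head"],
}

config = AWQHPUConfig.from_config(quant_cfg)
print(config.get_name())              # "awq_hpu"
print(config.modules_to_not_convert)  # ["lm_head"]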

get_config_filenames staticmethod

get_config_filenames() -> list[str]
Source code in vllm_gaudi/ops/hpu_awq.py
@staticmethod
def get_config_filenames() -> list[str]:
    return [
        "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
        # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
        "quantize_config.json",
    ]

get_min_capability classmethod

get_min_capability() -> int
Source code in vllm_gaudi/ops/hpu_awq.py
@classmethod
def get_min_capability(cls) -> int:
    # The AWQ kernel only supports Turing or newer GPUs.
    return 0

get_name

get_name() -> str
Source code in vllm_gaudi/ops/hpu_awq.py
def get_name(self) -> str:
    return "awq_hpu"

get_quant_method

get_quant_method(
    layer: Module, prefix: str
) -> Optional[AWQHPULinearMethod]
Source code in vllm_gaudi/ops/hpu_awq.py
def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["AWQHPULinearMethod"]:
    LinearBase, _, UnquantizedLinearMethod = get_linear_classes()
    if isinstance(layer, LinearBase):
        if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
            return UnquantizedLinearMethod()
        return AWQHPULinearMethod(self)
    return None

get_scaled_act_names

get_scaled_act_names() -> list[str]
Source code in vllm_gaudi/ops/hpu_awq.py
def get_scaled_act_names(self) -> list[str]:
    return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]

get_supported_act_dtypes

get_supported_act_dtypes() -> list[dtype]
Source code in vllm_gaudi/ops/hpu_awq.py
def get_supported_act_dtypes(self) -> list[torch.dtype]:
    return [torch.bfloat16]

override_quantization_method classmethod

override_quantization_method(
    hf_quant_cfg, user_quant
) -> Optional[str]
Source code in vllm_gaudi/ops/hpu_awq.py
@classmethod
def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:

    is_valid_user_quant = user_quant == "awq_hpu"

    if is_valid_user_quant:
        instance = cls(weight_bits=4, group_size=128, zero_point=True)
        return instance.get_name()

    return None
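
Selection of this backend is opt-in: override_quantization_method only returns "awq_hpu" when the user explicitly requests it. A minimal sketch of what that request looks like from the standard vLLM entrypoint (the model path is a placeholder, and this assumes a Gaudi environment with vllm-gaudi installed):

from vllm import LLM

# Placeholder checkpoint path; passing quantization="awq_hpu" is the user
# request that makes this override return "awq_hpu".
llm = LLM(model="/path/to/awq-quantized-model", quantization="awq_hpu")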

AWQHPULinearMethod

Linear method for AWQ.

Parameters

Name          Type          Description                   Default
quant_config  AWQHPUConfig  The AWQ quantization config.  required
Source code in vllm_gaudi/ops/hpu_awq.py
class AWQHPULinearMethod:
    """Linear method for AWQ.

    Args:
        quant_config: The AWQ quantization config.
    """

    def __init__(self, quant_config: AWQHPUConfig):
        _, LinearMethodBase, _ = get_linear_classes()
        if not issubclass(self.__class__, LinearMethodBase):
            self.__class__ = type(
                self.__class__.__name__,
                (self.__class__, LinearMethodBase),
                dict(self.__class__.__dict__),
            )
        self.quant_config = quant_config

    def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: list[int],
                       input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs):

        (GroupQuantScaleParameter, PackedvLLMParameter) = get_parameter_classes()
        if input_size_per_partition % self.quant_config.group_size != 0:
            raise ValueError("The input size is not aligned with the quantized "
                             "weight shape. This can be caused by too large "
                             "tensor parallel size.")

        output_size_per_partition = sum(output_partition_sizes)
        if output_size_per_partition % self.quant_config.pack_factor != 0:
            raise ValueError("The output size is not aligned with the quantized "
                             "weight shape. This can be caused by too large "
                             "tensor parallel size.")

        weight_loader = extra_weight_attrs.get("weight_loader")
        qweight = PackedvLLMParameter(data=torch.empty(
            input_size_per_partition,
            output_size_per_partition // self.quant_config.pack_factor,
            dtype=torch.int32,
        ),
                                      input_dim=0,
                                      output_dim=1,
                                      packed_dim=1,
                                      packed_factor=self.quant_config.pack_factor,
                                      weight_loader=weight_loader)

        qzeros = PackedvLLMParameter(data=torch.empty(
            input_size_per_partition // self.quant_config.group_size,
            output_size_per_partition // self.quant_config.pack_factor,
            dtype=torch.int32,
        ),
                                     input_dim=0,
                                     output_dim=1,
                                     packed_dim=1,
                                     packed_factor=self.quant_config.pack_factor,
                                     weight_loader=weight_loader)

        scales = GroupQuantScaleParameter(data=torch.empty(
            input_size_per_partition // self.quant_config.group_size,
            output_size_per_partition,
            dtype=params_dtype,
        ),
                                          input_dim=0,
                                          output_dim=1,
                                          weight_loader=weight_loader)

        qzeros.pack_factor = self.quant_config.pack_factor
        qweight.pack_factor = self.quant_config.pack_factor

        layer.register_parameter("qweight", qweight)
        layer.register_parameter("qzeros", qzeros)
        layer.register_parameter("scales", scales)

    def pack_tensor(self, x):
        wf = torch.tensor(list(range(0, 32, self.quant_config.weight_bits)), dtype=torch.int32).unsqueeze(0)
        xp = torch.sum(torch.bitwise_left_shift(x.reshape(x.shape[0], -1, (32 // self.quant_config.weight_bits)),
                                                wf.unsqueeze(0)),
                       dim=-1).to(torch.int32)
        return xp

    def unpack_tensor(self, xp):
        wf = torch.tensor(list(range(0, 32, self.quant_config.weight_bits)), dtype=torch.int32).unsqueeze(0)
        x = torch.bitwise_right_shift(
            torch.unsqueeze(xp, -1).expand(xp.shape[0], -1, 32 // self.quant_config.weight_bits),
            wf.unsqueeze(0)).to(torch.int8)
        x = torch.bitwise_and(x, (2**self.quant_config.weight_bits) - 1)
        x = x.reshape((x.shape[0], -1))
        return x

    def awq_order(self, x):

        order = [0, 4, 1, 5, 2, 6, 3, 7]
        idx = torch.arange(
            x.shape[-1],
            dtype=torch.int32,
            device=x.device,
        )
        idx = idx.view(-1, 32 // self.quant_config.weight_bits)
        idx = idx[:, order]
        idx = idx.view(-1)

        x = x[:, idx]
        return x

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

        # This unpacking-packing is required because HPU dequant kernel
        # is not compatible with AWQ format
        device = layer.qweight.device
        wq = layer.qweight.cpu()
        zq = layer.qzeros.cpu()
        wqu = self.awq_order(self.unpack_tensor(wq))
        zu = self.awq_order(self.unpack_tensor(zq))
        layer.qweight.data = self.pack_tensor(wqu).to(device)
        layer.qzeros.data = self.pack_tensor(zu).to(device)

        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)

    def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        qweight = layer.qweight
        scales = layer.scales
        qzeros = layer.qzeros
        pack_factor = self.quant_config.pack_factor
        out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
        reshaped_x = x.reshape(-1, x.shape[-1])

        weight = torch.ops.hpu.convert_from_uint4(qweight, scales, qzeros, x.dtype)
        out = torch.matmul(reshaped_x, weight)

        if bias is not None:
            out.add_(bias)
        return out.reshape(out_shape)

__class__ instance attribute

__class__ = type(
    __name__, (__class__, LinearMethodBase), dict(__dict__)
)

quant_config instance attribute

quant_config = quant_config

__init__

__init__(quant_config: AWQHPUConfig)
Source code in vllm_gaudi/ops/hpu_awq.py
def __init__(self, quant_config: AWQHPUConfig):
    _, LinearMethodBase, _ = get_linear_classes()
    if not issubclass(self.__class__, LinearMethodBase):
        self.__class__ = type(
            self.__class__.__name__,
            (self.__class__, LinearMethodBase),
            dict(self.__class__.__dict__),
        )
    self.quant_config = quant_config
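
The lazy mix-in in __init__ avoids importing vLLM's LinearMethodBase at module import time, yet still makes instances pass isinstance checks against it. A generic, self-contained sketch of the same pattern (Base and Standalone are illustrative names, not part of the module):

class Base:
    pass

class Standalone:
    def __init__(self):
        # Inject Base into the class hierarchy at construction time.
        if not issubclass(self.__class__, Base):
            self.__class__ = type(
                self.__class__.__name__,
                (self.__class__, Base),
                dict(self.__class__.__dict__),
            )

obj = Standalone()
print(isinstance(obj, Base))   # True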

apply

apply(
    layer: Module, x: Tensor, bias: Optional[Tensor] = None
) -> Tensor
Source code in vllm_gaudi/ops/hpu_awq.py
def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    qweight = layer.qweight
    scales = layer.scales
    qzeros = layer.qzeros
    pack_factor = self.quant_config.pack_factor
    out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
    reshaped_x = x.reshape(-1, x.shape[-1])

    weight = torch.ops.hpu.convert_from_uint4(qweight, scales, qzeros, x.dtype)
    out = torch.matmul(reshaped_x, weight)

    if bias is not None:
        out.add_(bias)
    return out.reshape(out_shape)
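
torch.ops.hpu.convert_from_uint4 only exists on Gaudi devices, but the shape bookkeeping in apply can be followed anywhere. A small sketch with illustrative sizes showing how the packed qweight width maps back to the full output width:

import torch

pack_factor = 8                               # 32 // 4 for 4-bit AWQ
in_features, out_features = 4096, 11008       # illustrative layer sizes
qweight = torch.empty(in_features, out_features // pack_factor, dtype=torch.int32)

x = torch.randn(2, 7, in_features)
out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
print(out_shape)                              # (2, 7, 11008)

reshaped_x = x.reshape(-1, x.shape[-1])       # (14, 4096), fed to the matmul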

awq_order

awq_order(x)
Source code in vllm_gaudi/ops/hpu_awq.py
def awq_order(self, x):

    order = [0, 4, 1, 5, 2, 6, 3, 7]
    idx = torch.arange(
        x.shape[-1],
        dtype=torch.int32,
        device=x.device,
    )
    idx = idx.view(-1, 32 // self.quant_config.weight_bits)
    idx = idx[:, order]
    idx = idx.view(-1)

    x = x[:, idx]
    return x
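
A minimal CPU-only sketch of the permutation awq_order applies: with 4-bit weights every int32 word carries 8 nibbles, and columns are permuted in groups of 8 using the order [0, 4, 1, 5, 2, 6, 3, 7]. Using a tensor whose values equal their column indices makes the reorder visible:

import torch

order = [0, 4, 1, 5, 2, 6, 3, 7]
x = torch.arange(16).unsqueeze(0)                      # one row, columns 0..15
idx = torch.arange(16).view(-1, 8)[:, order].view(-1)  # permute within each group of 8
print(x[:, idx])
# tensor([[ 0,  4,  1,  5,  2,  6,  3,  7,  8, 12,  9, 13, 10, 14, 11, 15]])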

create_weights

create_weights(
    layer: Module,
    input_size_per_partition: int,
    output_partition_sizes: list[int],
    input_size: int,
    output_size: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)
Source code in vllm_gaudi/ops/hpu_awq.py
def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: list[int],
                   input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs):

    (GroupQuantScaleParameter, PackedvLLMParameter) = get_parameter_classes()
    if input_size_per_partition % self.quant_config.group_size != 0:
        raise ValueError("The input size is not aligned with the quantized "
                         "weight shape. This can be caused by too large "
                         "tensor parallel size.")

    output_size_per_partition = sum(output_partition_sizes)
    if output_size_per_partition % self.quant_config.pack_factor != 0:
        raise ValueError("The output size is not aligned with the quantized "
                         "weight shape. This can be caused by too large "
                         "tensor parallel size.")

    weight_loader = extra_weight_attrs.get("weight_loader")
    qweight = PackedvLLMParameter(data=torch.empty(
        input_size_per_partition,
        output_size_per_partition // self.quant_config.pack_factor,
        dtype=torch.int32,
    ),
                                  input_dim=0,
                                  output_dim=1,
                                  packed_dim=1,
                                  packed_factor=self.quant_config.pack_factor,
                                  weight_loader=weight_loader)

    qzeros = PackedvLLMParameter(data=torch.empty(
        input_size_per_partition // self.quant_config.group_size,
        output_size_per_partition // self.quant_config.pack_factor,
        dtype=torch.int32,
    ),
                                 input_dim=0,
                                 output_dim=1,
                                 packed_dim=1,
                                 packed_factor=self.quant_config.pack_factor,
                                 weight_loader=weight_loader)

    scales = GroupQuantScaleParameter(data=torch.empty(
        input_size_per_partition // self.quant_config.group_size,
        output_size_per_partition,
        dtype=params_dtype,
    ),
                                      input_dim=0,
                                      output_dim=1,
                                      weight_loader=weight_loader)

    qzeros.pack_factor = self.quant_config.pack_factor
    qweight.pack_factor = self.quant_config.pack_factor

    layer.register_parameter("qweight", qweight)
    layer.register_parameter("qzeros", qzeros)
    layer.register_parameter("scales", scales)

pack_tensor

pack_tensor(x)
Source code in vllm_gaudi/ops/hpu_awq.py
def pack_tensor(self, x):
    wf = torch.tensor(list(range(0, 32, self.quant_config.weight_bits)), dtype=torch.int32).unsqueeze(0)
    xp = torch.sum(torch.bitwise_left_shift(x.reshape(x.shape[0], -1, (32 // self.quant_config.weight_bits)),
                                            wf.unsqueeze(0)),
                   dim=-1).to(torch.int32)
    return xp

process_weights_after_loading

process_weights_after_loading(layer: Module) -> None
Source code in vllm_gaudi/ops/hpu_awq.py
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

    # This unpacking-packing is required because HPU dequant kernel
    # is not compatible with AWQ format
    device = layer.qweight.device
    wq = layer.qweight.cpu()
    zq = layer.qzeros.cpu()
    wqu = self.awq_order(self.unpack_tensor(wq))
    zu = self.awq_order(self.unpack_tensor(zq))
    layer.qweight.data = self.pack_tensor(wqu).to(device)
    layer.qzeros.data = self.pack_tensor(zu).to(device)

    layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
    layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
    layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)

unpack_tensor

unpack_tensor(xp)
Source code in vllm_gaudi/ops/hpu_awq.py
def unpack_tensor(self, xp):
    wf = torch.tensor(list(range(0, 32, self.quant_config.weight_bits)), dtype=torch.int32).unsqueeze(0)
    x = torch.bitwise_right_shift(
        torch.unsqueeze(xp, -1).expand(xp.shape[0], -1, 32 // self.quant_config.weight_bits),
        wf.unsqueeze(0)).to(torch.int8)
    x = torch.bitwise_and(x, (2**self.quant_config.weight_bits) - 1)
    x = x.reshape((x.shape[0], -1))
    return x
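
pack_tensor and unpack_tensor are inverses for 4-bit values: packing places 8 nibbles into each int32 word at bit offsets 0, 4, ..., 28, and unpacking shifts them back out and masks the low nibble. The following CPU round-trip sketch re-derives the same bit manipulation standalone (it does not call the class methods):

import torch

weight_bits = 4
wf = torch.tensor(list(range(0, 32, weight_bits)), dtype=torch.int32).unsqueeze(0)

x = torch.randint(0, 16, (2, 16), dtype=torch.int32)      # 4-bit values, 16 per row

# pack: 8 nibbles per int32 word -> shape (2, 2)
packed = torch.sum(
    torch.bitwise_left_shift(x.reshape(x.shape[0], -1, 8), wf.unsqueeze(0)),
    dim=-1).to(torch.int32)

# unpack: shift each word right by 0, 4, ..., 28 bits, keep the low nibble
unpacked = torch.bitwise_and(
    torch.bitwise_right_shift(packed.unsqueeze(-1).expand(packed.shape[0], -1, 8),
                              wf.unsqueeze(0)),
    (2 ** weight_bits) - 1).reshape(packed.shape[0], -1)

assert torch.equal(unpacked, x)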

get_linear_classes

get_linear_classes()
Source code in vllm_gaudi/ops/hpu_awq.py
def get_linear_classes():
    from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod)
    return LinearBase, LinearMethodBase, UnquantizedLinearMethod

get_parameter_classes

get_parameter_classes()
Source code in vllm_gaudi/ops/hpu_awq.py
def get_parameter_classes():
    from vllm.model_executor.parameter import (
        GroupQuantScaleParameter,
        PackedvLLMParameter,
    )
    return GroupQuantScaleParameter, PackedvLLMParameter

is_layer_skipped_awq

is_layer_skipped_awq(
    prefix: str, modules_to_not_convert: list[str]
)
Source code in vllm_gaudi/ops/hpu_awq.py
def is_layer_skipped_awq(prefix: str, modules_to_not_convert: list[str]):
    return any(module_name in prefix for module_name in modules_to_not_convert)
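
The skip check is a plain substring match on the layer prefix, so an entry such as "lm_head" in modules_to_not_convert excludes every layer whose prefix contains it. A quick sketch with illustrative prefixes:

from vllm_gaudi.ops.hpu_awq import is_layer_skipped_awq

print(is_layer_skipped_awq("model.layers.0.mlp.gate_proj", ["lm_head"]))   # False
print(is_layer_skipped_awq("lm_head", ["lm_head"]))                        # True
print(is_layer_skipped_awq("model.visual.blocks.0.attn.qkv", ["visual"]))  # True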