
vllm_gaudi.ops.hpu_gptq

GPTQHPUConfig

Bases: QuantizationConfig

Config class for GPTQ.

Reference: https://arxiv.org/abs/2210.17323
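
A minimal usage sketch (the import path follows the module shown below; the argument values are illustrative, not defaults of any particular model):

from vllm_gaudi.ops.hpu_gptq import GPTQHPUConfig

cfg = GPTQHPUConfig(weight_bits=4, group_size=128, desc_act=True, lm_head_quantized=False)
print(cfg.pack_factor)  # 8 -> eight 4-bit values are packed into one int32
print(cfg.get_name())   # "gptq_hpu"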

Source code in vllm_gaudi/ops/hpu_gptq.py
@register_quantization_config("gptq_hpu")
class GPTQHPUConfig(QuantizationConfig):
    """Config class for GPTQ.

    Reference: https://arxiv.org/abs/2210.17323
    """

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        desc_act: bool,
        lm_head_quantized: bool,
    ) -> None:
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.desc_act = desc_act
        self.lm_head_quantized = lm_head_quantized
        self.pack_factor = Fraction(32, self.weight_bits)
        if self.weight_bits not in [2, 3, 4, 8]:
            raise ValueError("Currently, only 2/3/4/8-bit weight quantization is "
                             f"supported for GPTQ, but got {self.weight_bits} bits.")

    def __repr__(self) -> str:
        return (f"GPTQHPUConfig(weight_bits={self.weight_bits}, "
                f"group_size={self.group_size}, "
                f"desc_act={self.desc_act}),"
                f"lm_head_quantized={self.lm_head_quantized}")

    @classmethod
    def get_name(cls) -> str:
        return "gptq_hpu"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.bfloat16]

    @classmethod
    # HPU does not use CUDA compute capability, so no minimum is enforced.
    def get_min_capability(cls) -> int:
        return 0

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "GPTQHPUConfig":
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        desc_act = cls.get_from_keys(config, ["desc_act"])
        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
        return cls(weight_bits, group_size, desc_act, lm_head_quantized)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:

        is_valid_user_quant = user_quant == "gptq_hpu"

        if is_valid_user_quant:
            instance = cls(weight_bits=4, group_size=128, desc_act=True, lm_head_quantized=False)
            return instance.get_name()

        return None

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["GPTQHPULinearMethod"]:
        LinearBase, _ = get_linear_classes()
        from vllm.model_executor.layers.vocab_parallel_embedding \
                import ParallelLMHead
        if (isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
            return GPTQHPULinearMethod(self)
        return None

    def get_scaled_act_names(self) -> list[str]:
        return []

desc_act instance-attribute

desc_act = desc_act

group_size instance-attribute

group_size = group_size

lm_head_quantized instance-attribute

lm_head_quantized = lm_head_quantized

pack_factor instance-attribute

pack_factor = Fraction(32, weight_bits)

weight_bits instance-attribute

weight_bits = weight_bits

__init__

__init__(
    weight_bits: int,
    group_size: int,
    desc_act: bool,
    lm_head_quantized: bool,
) -> None
Source code in vllm_gaudi/ops/hpu_gptq.py
def __init__(
    self,
    weight_bits: int,
    group_size: int,
    desc_act: bool,
    lm_head_quantized: bool,
) -> None:
    self.weight_bits = weight_bits
    self.group_size = group_size
    self.desc_act = desc_act
    self.lm_head_quantized = lm_head_quantized
    self.pack_factor = Fraction(32, self.weight_bits)
    if self.weight_bits not in [2, 3, 4, 8]:
        raise ValueError("Currently, only 2/3/4/8-bit weight quantization is "
                         f"supported for GPTQ, but got {self.weight_bits} bits.")

__repr__

__repr__() -> str
Source code in vllm_gaudi/ops/hpu_gptq.py
def __repr__(self) -> str:
    return (f"GPTQHPUConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size}, "
            f"desc_act={self.desc_act}),"
            f"lm_head_quantized={self.lm_head_quantized}")

from_config classmethod

from_config(config: dict[str, Any]) -> GPTQHPUConfig
Source code in vllm_gaudi/ops/hpu_gptq.py
@classmethod
def from_config(cls, config: dict[str, Any]) -> "GPTQHPUConfig":
    weight_bits = cls.get_from_keys(config, ["bits"])
    group_size = cls.get_from_keys(config, ["group_size"])
    desc_act = cls.get_from_keys(config, ["desc_act"])
    lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
    return cls(weight_bits, group_size, desc_act, lm_head_quantized)
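
As a sketch, a quantize_config.json written by common GPTQ tooling carries exactly the keys read above; the dict below is illustrative:

from vllm_gaudi.ops.hpu_gptq import GPTQHPUConfig

# Hypothetical contents of quantize_config.json
quant_cfg_dict = {"bits": 4, "group_size": 128, "desc_act": True, "lm_head": False}
cfg = GPTQHPUConfig.from_config(quant_cfg_dict)
print(cfg.weight_bits, cfg.group_size, cfg.lm_head_quantized)  # 4 128 False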

get_config_filenames classmethod

get_config_filenames() -> list[str]
Source code in vllm_gaudi/ops/hpu_gptq.py
@classmethod
def get_config_filenames(cls) -> list[str]:
    return ["quantize_config.json"]

get_min_capability classmethod

get_min_capability() -> int
Source code in vllm_gaudi/ops/hpu_gptq.py
@classmethod
# HPU does not use CUDA compute capability, so no minimum is enforced.
def get_min_capability(cls) -> int:
    return 0

get_name classmethod

get_name() -> str
Source code in vllm_gaudi/ops/hpu_gptq.py
@classmethod
def get_name(cls) -> str:
    return "gptq_hpu"

get_quant_method

get_quant_method(
    layer: Module, prefix: str
) -> Optional[GPTQHPULinearMethod]
Source code in vllm_gaudi/ops/hpu_gptq.py
def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["GPTQHPULinearMethod"]:
    LinearBase, _ = get_linear_classes()
    from vllm.model_executor.layers.vocab_parallel_embedding \
            import ParallelLMHead
    if (isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
        return GPTQHPULinearMethod(self)
    return None

get_scaled_act_names

get_scaled_act_names() -> list[str]
Source code in vllm_gaudi/ops/hpu_gptq.py
def get_scaled_act_names(self) -> list[str]:
    return []

get_supported_act_dtypes classmethod

get_supported_act_dtypes() -> list[dtype]
Source code in vllm_gaudi/ops/hpu_gptq.py
@classmethod
def get_supported_act_dtypes(cls) -> list[torch.dtype]:
    return [torch.bfloat16]

override_quantization_method classmethod

override_quantization_method(
    hf_quant_cfg, user_quant
) -> Optional[str]
Source code in vllm_gaudi/ops/hpu_gptq.py
@classmethod
def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:

    is_valid_user_quant = user_quant == "gptq_hpu"

    if is_valid_user_quant:
        instance = cls(weight_bits=4, group_size=128, desc_act=True, lm_head_quantized=False)
        return instance.get_name()

    return None
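
The override only fires when the user asks for this backend by name. A hedged sketch of selecting it through vLLM's quantization argument (the model path is a placeholder):

from vllm import LLM

# Request the HPU GPTQ backend explicitly; otherwise override_quantization_method
# returns None and the quantization method is resolved as usual.
llm = LLM(model="/path/to/gptq-model", quantization="gptq_hpu", dtype="bfloat16")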

GPTQHPULinearMethod

Linear method for GPTQ.

Parameters

Name           Type            Description                      Default
quant_config   GPTQHPUConfig   The GPTQ quantization config.    required
Source code in vllm_gaudi/ops/hpu_gptq.py
class GPTQHPULinearMethod:
    """Linear method for GPTQ.

    Args:
        quant_config: The GPTQ quantization config.
    """

    def __init__(self, quant_config: GPTQHPUConfig):
        _, LinearMethodBase = get_linear_classes()
        if not issubclass(self.__class__, LinearMethodBase):
            self.__class__ = type(
                self.__class__.__name__,
                (self.__class__, LinearMethodBase),
                dict(self.__class__.__dict__),
            )
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, PackedvLLMParameter,
         RowvLLMParameter) = get_parameter_classes()

        del output_size  # Unused.
        weight_loader = extra_weight_attrs.get("weight_loader")
        if input_size_per_partition % self.quant_config.group_size != 0:
            raise ValueError("The input size is not aligned with the quantized "
                             "weight shape. This can be caused by too large "
                             "tensor parallel size.")
        output_size_per_partition = sum(output_partition_sizes)
        if (output_size_per_partition % self.quant_config.pack_factor.numerator != 0):
            raise ValueError("The output size is not aligned with the quantized "
                             "weight shape. This can be caused by too large "
                             "tensor parallel size.")

        group_size = self.quant_config.group_size if self.quant_config.group_size != -1 else input_size
        scale_and_zero_size = input_size // group_size

        qweight = PackedvLLMParameter(data=torch.empty(
            input_size_per_partition // self.quant_config.pack_factor,
            output_size_per_partition,
            dtype=torch.int32,
        ),
                                      input_dim=0,
                                      output_dim=1,
                                      packed_dim=0,
                                      packed_factor=self.quant_config.pack_factor,
                                      weight_loader=weight_loader)

        g_idx = RowvLLMParameter(data=torch.tensor(
            [i // self.quant_config.group_size for i in range(input_size_per_partition)],
            dtype=torch.int32,
        ),
                                 input_dim=0,
                                 weight_loader=weight_loader)
        qzeros_args = {
            "data":
            torch.empty(
                scale_and_zero_size,
                output_size_per_partition // self.quant_config.pack_factor,
                dtype=torch.int32,
            ),
            "weight_loader":
            weight_loader
        }
        weight_scale_args = {
            "data": torch.empty(
                scale_and_zero_size,
                output_size_per_partition,
                dtype=params_dtype,
            ),
            "weight_loader": weight_loader
        }

        scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
        qzeros = PackedColumnParameter(output_dim=1,
                                       packed_dim=1,
                                       packed_factor=self.quant_config.pack_factor,
                                       **qzeros_args)

        qzeros.pack_factor = self.quant_config.pack_factor

        layer.register_parameter("qweight", qweight)
        layer.register_parameter("g_idx", g_idx)
        layer.register_parameter("qzeros", qzeros)
        layer.register_parameter("scales", scales)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

        self.wf = torch.tensor(list(range(0, 32, self.quant_config.weight_bits)), dtype=torch.int32).unsqueeze(0)
        weight = self.unpack_weight_from_cuda_old_format(layer)
        layer.qweight.data = self.pack_tensor(weight).to('hpu')

        zeros = self.unpack_zeros_from_cuda_old_format(layer).cpu()
        layer.qzeros.data = self.pack_tensor(zeros).to('hpu')

        # for torch.compile
        layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
        layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False)
        layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False)
        layer.scales = Parameter(layer.scales.data, requires_grad=False)

    def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:

        out_shape = x.shape[:-1]
        if hasattr(layer, 'output_size_per_partition'):
            out_shape += (layer.output_size_per_partition, )
        else:
            out_shape += (layer.output_size, )

        reshaped_x = x.reshape(-1, x.shape[-1])

        weight = torch.ops.hpu.convert_from_uint4(layer.qweight, layer.scales, layer.qzeros, x.dtype, layer.g_idx)
        output = torch.matmul(reshaped_x, weight)

        if bias is not None:
            output.add_(bias)
        return output.reshape(out_shape)

    def pack_tensor(self, input, bits=4):
        normal = input.to(torch.int32)
        q = torch.sum(torch.bitwise_left_shift(normal.reshape(normal.shape[0], -1, (32 // bits)), self.wf.unsqueeze(0)),
                      dim=-1).to(torch.int32)

        return q

    def unpack_zeros_from_cuda_old_format(self, layer):

        bits = self.quant_config.weight_bits
        zeros = torch.bitwise_right_shift(
            torch.unsqueeze(layer.qzeros.to('cpu'), 2).expand(-1, -1, 32 // bits),
            self.wf.unsqueeze(0),
        ).to(torch.int16 if bits == 8 else torch.int8)

        zeros = zeros + 1
        zeros = torch.bitwise_and(zeros, (2**bits) - 1).to(layer.scales.dtype)
        zeros = zeros.reshape(-1, zeros.shape[1] * zeros.shape[2])
        return zeros

    def unpack_weight_from_cuda_old_format(self, layer):

        qweight = layer.qweight.cpu()
        bits = self.quant_config.weight_bits

        weight = torch.bitwise_right_shift(
            torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1),
            self.wf.unsqueeze(-1),
        ).to(torch.int16 if bits == 8 else torch.int8)
        weight = torch.bitwise_and(weight, (2**bits) - 1)
        weight = weight.reshape((weight.shape[0] * weight.shape[1], weight.shape[2]))
        return weight

__class__ instance-attribute

__class__ = type(
    __name__, (__class__, LinearMethodBase), dict(__dict__)
)

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(quant_config: GPTQHPUConfig)
Source code in vllm_gaudi/ops/hpu_gptq.py
def __init__(self, quant_config: GPTQHPUConfig):
    _, LinearMethodBase = get_linear_classes()
    if not issubclass(self.__class__, LinearMethodBase):
        self.__class__ = type(
            self.__class__.__name__,
            (self.__class__, LinearMethodBase),
            dict(self.__class__.__dict__),
        )
    self.quant_config = quant_config

apply

apply(
    layer: Module, x: Tensor, bias: Optional[Tensor] = None
) -> Tensor
Source code in vllm_gaudi/ops/hpu_gptq.py
def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:

    out_shape = x.shape[:-1]
    if hasattr(layer, 'output_size_per_partition'):
        out_shape += (layer.output_size_per_partition, )
    else:
        out_shape += (layer.output_size, )

    reshaped_x = x.reshape(-1, x.shape[-1])

    weight = torch.ops.hpu.convert_from_uint4(layer.qweight, layer.scales, layer.qzeros, x.dtype, layer.g_idx)
    output = torch.matmul(reshaped_x, weight)

    if bias is not None:
        output.add_(bias)
    return output.reshape(out_shape)
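
For orientation, a self-contained CPU reference of the math the HPU op performs before the matmul (shapes and names are illustrative; this is not the kernel itself):

import torch

in_features, out_features, group_size = 8, 4, 4
w_q = torch.randint(0, 16, (in_features, out_features)).float()     # unpacked 4-bit codes
scales = torch.rand(in_features // group_size, out_features)        # per-group scales
zeros = torch.randint(1, 16, (in_features // group_size, out_features)).float()
g_idx = torch.arange(in_features) // group_size                     # row -> group mapping

w = (w_q - zeros[g_idx]) * scales[g_idx]                            # dequantize per group
x = torch.randn(2, in_features)
y = x @ w                                                           # same matmul as apply()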

create_weights

create_weights(
    layer: Module,
    input_size_per_partition: int,
    output_partition_sizes: list[int],
    input_size: int,
    output_size: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)
Source code in vllm_gaudi/ops/hpu_gptq.py
def create_weights(
    self,
    layer: torch.nn.Module,
    input_size_per_partition: int,
    output_partition_sizes: list[int],
    input_size: int,
    output_size: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, PackedvLLMParameter,
     RowvLLMParameter) = get_parameter_classes()

    del output_size  # Unused.
    weight_loader = extra_weight_attrs.get("weight_loader")
    if input_size_per_partition % self.quant_config.group_size != 0:
        raise ValueError("The input size is not aligned with the quantized "
                         "weight shape. This can be caused by too large "
                         "tensor parallel size.")
    output_size_per_partition = sum(output_partition_sizes)
    if (output_size_per_partition % self.quant_config.pack_factor.numerator != 0):
        raise ValueError("The output size is not aligned with the quantized "
                         "weight shape. This can be caused by too large "
                         "tensor parallel size.")

    group_size = self.quant_config.group_size if self.quant_config.group_size != -1 else input_size
    scale_and_zero_size = input_size // group_size

    qweight = PackedvLLMParameter(data=torch.empty(
        input_size_per_partition // self.quant_config.pack_factor,
        output_size_per_partition,
        dtype=torch.int32,
    ),
                                  input_dim=0,
                                  output_dim=1,
                                  packed_dim=0,
                                  packed_factor=self.quant_config.pack_factor,
                                  weight_loader=weight_loader)

    g_idx = RowvLLMParameter(data=torch.tensor(
        [i // self.quant_config.group_size for i in range(input_size_per_partition)],
        dtype=torch.int32,
    ),
                             input_dim=0,
                             weight_loader=weight_loader)
    qzeros_args = {
        "data":
        torch.empty(
            scale_and_zero_size,
            output_size_per_partition // self.quant_config.pack_factor,
            dtype=torch.int32,
        ),
        "weight_loader":
        weight_loader
    }
    weight_scale_args = {
        "data": torch.empty(
            scale_and_zero_size,
            output_size_per_partition,
            dtype=params_dtype,
        ),
        "weight_loader": weight_loader
    }

    scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
    qzeros = PackedColumnParameter(output_dim=1,
                                   packed_dim=1,
                                   packed_factor=self.quant_config.pack_factor,
                                   **qzeros_args)

    qzeros.pack_factor = self.quant_config.pack_factor

    layer.register_parameter("qweight", qweight)
    layer.register_parameter("g_idx", g_idx)
    layer.register_parameter("qzeros", qzeros)
    layer.register_parameter("scales", scales)

pack_tensor

pack_tensor(input, bits=4)
Source code in vllm_gaudi/ops/hpu_gptq.py
def pack_tensor(self, input, bits=4):
    normal = input.to(torch.int32)
    q = torch.sum(torch.bitwise_left_shift(normal.reshape(normal.shape[0], -1, (32 // bits)), self.wf.unsqueeze(0)),
                  dim=-1).to(torch.int32)

    return q
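
A standalone sketch of the same packing arithmetic for 4-bit values; wf here mirrors the shift table built in process_weights_after_loading:

import torch

bits = 4
wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32).unsqueeze(0)  # [[0, 4, 8, ..., 28]]

vals = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 0]], dtype=torch.int32)  # eight 4-bit codes
packed = torch.sum(
    torch.bitwise_left_shift(vals.reshape(vals.shape[0], -1, 32 // bits), wf.unsqueeze(0)),
    dim=-1,
).to(torch.int32)
print(hex(packed.item()))  # 0x7654321 -> each input value occupies one nibble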

process_weights_after_loading

process_weights_after_loading(layer: Module) -> None
Source code in vllm_gaudi/ops/hpu_gptq.py
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

    self.wf = torch.tensor(list(range(0, 32, self.quant_config.weight_bits)), dtype=torch.int32).unsqueeze(0)
    weight = self.unpack_weight_from_cuda_old_format(layer)
    layer.qweight.data = self.pack_tensor(weight).to('hpu')

    zeros = self.unpack_zeros_from_cuda_old_format(layer).cpu()
    layer.qzeros.data = self.pack_tensor(zeros).to('hpu')

    # for torch.compile
    layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
    layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False)
    layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False)
    layer.scales = Parameter(layer.scales.data, requires_grad=False)

unpack_weight_from_cuda_old_format

unpack_weight_from_cuda_old_format(layer)
Source code in vllm_gaudi/ops/hpu_gptq.py
def unpack_weight_from_cuda_old_format(self, layer):

    qweight = layer.qweight.cpu()
    bits = self.quant_config.weight_bits

    weight = torch.bitwise_right_shift(
        torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1),
        self.wf.unsqueeze(-1),
    ).to(torch.int16 if bits == 8 else torch.int8)
    weight = torch.bitwise_and(weight, (2**bits) - 1)
    weight = weight.reshape((weight.shape[0] * weight.shape[1], weight.shape[2]))
    return weight
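
The inverse direction as a self-contained sketch: recover the eight 4-bit codes from one int32 of the row-packed format (variable names are illustrative):

import torch

bits = 4
wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32)  # shift per nibble

packed = torch.tensor([[0x7654321]], dtype=torch.int32)
codes = torch.bitwise_and(
    torch.bitwise_right_shift(packed.unsqueeze(1).expand(-1, 32 // bits, -1), wf.view(1, -1, 1)),
    (2**bits) - 1,
)
print(codes.reshape(-1).tolist())  # [1, 2, 3, 4, 5, 6, 7, 0]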

unpack_zeros_from_cuda_old_format

unpack_zeros_from_cuda_old_format(layer)
Source code in vllm_gaudi/ops/hpu_gptq.py
def unpack_zeros_from_cuda_old_format(self, layer):

    bits = self.quant_config.weight_bits
    zeros = torch.bitwise_right_shift(
        torch.unsqueeze(layer.qzeros.to('cpu'), 2).expand(-1, -1, 32 // bits),
        self.wf.unsqueeze(0),
    ).to(torch.int16 if bits == 8 else torch.int8)

    zeros = zeros + 1
    zeros = torch.bitwise_and(zeros, (2**bits) - 1).to(layer.scales.dtype)
    zeros = zeros.reshape(-1, zeros.shape[1] * zeros.shape[2])
    return zeros

get_linear_classes

get_linear_classes()
Source code in vllm_gaudi/ops/hpu_gptq.py
def get_linear_classes():
    from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
    return LinearBase, LinearMethodBase

get_parameter_classes

get_parameter_classes()
Source code in vllm_gaudi/ops/hpu_gptq.py
def get_parameter_classes():
    from vllm.model_executor.parameter import (
        ChannelQuantScaleParameter,
        GroupQuantScaleParameter,
        PackedColumnParameter,
        PackedvLLMParameter,
        RowvLLMParameter,
    )
    return (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, PackedvLLMParameter,
            RowvLLMParameter)