
vllm_gaudi.attention.ops.hpu_paged_attn

_PARTITION_SIZE module-attribute

_PARTITION_SIZE = 512

HPUPageAttentionInputBuilderBase dataclass

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@dataclass
class HPUPageAttentionInputBuilderBase:
    pass

__init__

__init__() -> None

HPUPagedAttention

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
class HPUPagedAttention:

    @staticmethod
    def get_supported_head_sizes() -> list[int]:
        return list(range(1, 257))

    @classmethod
    def supports_attn_type(cls, attn_type: str) -> bool:
        """CPU attention supports decoder and encoder-only attention."""
        from vllm.attention import AttentionType

        return attn_type in (
            AttentionType.DECODER,
            AttentionType.ENCODER,
            AttentionType.ENCODER_ONLY,
        )

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> tuple[int, ...]:
        return (num_blocks * block_size, num_kv_heads, head_size)

    @staticmethod
    def split_kv_cache(
        kv_cache: torch.Tensor,
        num_kv_heads: int,
        head_size: int,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        key_cache = kv_cache[0]
        value_cache = kv_cache[1]
        return key_cache, value_cache

    @staticmethod
    def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor,
                             slot_mapping: torch.Tensor, kv_cache_dtype: str, is_prompt: bool) -> None:
        cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, is_prompt)

    @staticmethod
    def forward_decode(**kwargs) -> torch.Tensor:
        if kwargs.get("kv_lora_rank"):
            return ops.flat_pa_mla(**kwargs)
        return ops.flat_pa(**kwargs)

    @staticmethod
    def swap_blocks(
        src_kv_cache: tuple[torch.Tensor, torch.Tensor],
        dst_kv_cache: tuple[torch.Tensor, torch.Tensor],
        src_to_dsts: torch.Tensor,
    ) -> None:
        src_key_cache = src_kv_cache[0]
        dst_key_cache = dst_kv_cache[0]
        cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts)

        src_value_cache = src_kv_cache[1]
        dst_value_cache = dst_kv_cache[1]
        cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts)

    @staticmethod
    def copy_blocks(
        kv_caches: list[tuple[torch.Tensor, torch.Tensor]],
        src_to_dsts: torch.Tensor,
    ) -> None:
        key_caches = [kv_cache[0] for kv_cache in kv_caches]
        value_caches = [kv_cache[1] for kv_cache in kv_caches]
        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)

copy_blocks staticmethod

copy_blocks(
    kv_caches: list[tuple[Tensor, Tensor]],
    src_to_dsts: Tensor,
) -> None
Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@staticmethod
def copy_blocks(
    kv_caches: list[tuple[torch.Tensor, torch.Tensor]],
    src_to_dsts: torch.Tensor,
) -> None:
    key_caches = [kv_cache[0] for kv_cache in kv_caches]
    value_caches = [kv_cache[1] for kv_cache in kv_caches]
    cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
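
A minimal usage sketch, assuming the usual vLLM convention that src_to_dsts is an integer tensor of [source_block, destination_block] index pairs; tensor shapes and values are illustrative, and the call requires the HPU cache ops to be available.

import torch
from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttention

num_blocks, block_size, num_kv_heads, head_size = 4, 128, 8, 128
shape = HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size)

# One (key_cache, value_cache) pair per attention layer.
kv_caches = [(torch.zeros(shape), torch.zeros(shape)) for _ in range(2)]

# Copy the contents of block 0 into block 3 in every layer's cache.
src_to_dsts = torch.tensor([[0, 3]], dtype=torch.int64)
HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts)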

forward_decode staticmethod

forward_decode(**kwargs) -> Tensor
Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@staticmethod
def forward_decode(**kwargs) -> torch.Tensor:
    if kwargs.get("kv_lora_rank"):
        return ops.flat_pa_mla(**kwargs)
    return ops.flat_pa(**kwargs)

get_kv_cache_shape staticmethod

get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> tuple[int, ...]
Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@staticmethod
def get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> tuple[int, ...]:
    return (num_blocks * block_size, num_kv_heads, head_size)
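
A quick sketch of the shape computation: the HPU layout flattens blocks and per-block slots into a single leading "slots" dimension. Values below are illustrative only.

from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttention

shape = HPUPagedAttention.get_kv_cache_shape(
    num_blocks=128, block_size=128, num_kv_heads=8, head_size=128)
assert shape == (16384, 8, 128)  # 128 blocks * 128 slots per block = 16384 slots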

get_supported_head_sizes staticmethod

get_supported_head_sizes() -> list[int]
Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@staticmethod
def get_supported_head_sizes() -> list[int]:
    return list(range(1, 257))
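
Head sizes from 1 to 256 inclusive are reported as supported. A small check, for illustration:

from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttention

supported = HPUPagedAttention.get_supported_head_sizes()
assert 128 in supported   # common head size
assert 256 in supported   # upper bound of range(1, 257)
assert 300 not in supported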

split_kv_cache staticmethod

split_kv_cache(
    kv_cache: Tensor, num_kv_heads: int, head_size: int
) -> tuple[Tensor, Tensor]
Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@staticmethod
def split_kv_cache(
    kv_cache: torch.Tensor,
    num_kv_heads: int,
    head_size: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    key_cache = kv_cache[0]
    value_cache = kv_cache[1]
    return key_cache, value_cache
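
A minimal sketch, assuming the combined cache is a stacked tensor whose leading dimension separates key (index 0) and value (index 1), which is how the implementation indexes it. Note that num_kv_heads and head_size are accepted for interface compatibility but are not used here.

import torch
from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttention

shape = HPUPagedAttention.get_kv_cache_shape(
    num_blocks=4, block_size=128, num_kv_heads=8, head_size=128)
kv_cache = torch.stack([torch.zeros(shape), torch.ones(shape)])  # [2, *shape]

key_cache, value_cache = HPUPagedAttention.split_kv_cache(kv_cache, num_kv_heads=8, head_size=128)
assert key_cache.shape == shape and value_cache.shape == shape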

supports_attn_type classmethod

supports_attn_type(attn_type: str) -> bool

CPU attention supports decoder and encoder-only attention.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
    """CPU attention supports decoder and encoder-only attention."""
    from vllm.attention import AttentionType

    return attn_type in (
        AttentionType.DECODER,
        AttentionType.ENCODER,
        AttentionType.ENCODER_ONLY,
    )
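
A usage sketch, assuming vLLM's AttentionType string constants; encoder-decoder cross-attention is the one type this check rejects.

from vllm.attention import AttentionType
from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttention

assert HPUPagedAttention.supports_attn_type(AttentionType.DECODER)
assert HPUPagedAttention.supports_attn_type(AttentionType.ENCODER_ONLY)
assert not HPUPagedAttention.supports_attn_type(AttentionType.ENCODER_DECODER)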

swap_blocks staticmethod

swap_blocks(
    src_kv_cache: tuple[Tensor, Tensor],
    dst_kv_cache: tuple[Tensor, Tensor],
    src_to_dsts: Tensor,
) -> None
Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@staticmethod
def swap_blocks(
    src_kv_cache: tuple[torch.Tensor, torch.Tensor],
    dst_kv_cache: tuple[torch.Tensor, torch.Tensor],
    src_to_dsts: torch.Tensor,
) -> None:
    src_key_cache = src_kv_cache[0]
    dst_key_cache = dst_kv_cache[0]
    cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts)

    src_value_cache = src_kv_cache[1]
    dst_value_cache = dst_kv_cache[1]
    cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts)
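
A hedged sketch: src_to_dsts is assumed to follow the usual vLLM convention of [source_block, destination_block] index pairs, and the zero-filled tensors stand in for real device and swap-space caches.

import torch
from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttention

shape = HPUPagedAttention.get_kv_cache_shape(
    num_blocks=4, block_size=128, num_kv_heads=8, head_size=128)
src_kv_cache = (torch.zeros(shape), torch.zeros(shape))  # e.g. device cache
dst_kv_cache = (torch.zeros(shape), torch.zeros(shape))  # e.g. swap-space cache

# Move block 1 of the source cache into block 0 of the destination cache.
src_to_dsts = torch.tensor([[1, 0]], dtype=torch.int64)
HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts)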

write_to_paged_cache staticmethod

write_to_paged_cache(
    key: Tensor,
    value: Tensor,
    key_cache: Tensor,
    value_cache: Tensor,
    slot_mapping: Tensor,
    kv_cache_dtype: str,
    is_prompt: bool,
) -> None
Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@staticmethod
def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor,
                         slot_mapping: torch.Tensor, kv_cache_dtype: str, is_prompt: bool) -> None:
    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, is_prompt)
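
A sketch, assuming the usual vLLM slot-mapping convention (slot = block_number * block_size + offset within the block) and the default "auto" kv_cache_dtype string; shapes and values are illustrative.

import torch
from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttention

num_blocks, block_size, num_kv_heads, head_size = 4, 128, 8, 128
shape = HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size)
key_cache, value_cache = torch.zeros(shape), torch.zeros(shape)

# Two new decode tokens written into block 2, offsets 0 and 1.
key = torch.randn(2, num_kv_heads, head_size)
value = torch.randn(2, num_kv_heads, head_size)
slot_mapping = torch.tensor([2 * block_size + 0, 2 * block_size + 1])

HPUPagedAttention.write_to_paged_cache(
    key, value, key_cache, value_cache, slot_mapping,
    kv_cache_dtype="auto", is_prompt=False)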

HPUPagedAttentionMetadata dataclass

Metadata for PagedAttention.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@dataclass
class HPUPagedAttentionMetadata:
    """Metadata for PagedAttention."""
    block_list: Optional[torch.Tensor]
    block_mapping: Optional[torch.Tensor]
    block_usage: Optional[torch.Tensor]
    block_groups: Optional[torch.Tensor]
    alibi_blocks: Optional[torch.Tensor]

alibi_blocks instance-attribute

alibi_blocks: Optional[Tensor]

block_groups instance-attribute

block_groups: Optional[Tensor]

block_list instance-attribute

block_list: Optional[Tensor]

block_mapping instance-attribute

block_mapping: Optional[Tensor]

block_usage instance-attribute

block_usage: Optional[Tensor]

__init__

__init__(
    block_list: Optional[Tensor],
    block_mapping: Optional[Tensor],
    block_usage: Optional[Tensor],
    block_groups: Optional[Tensor],
    alibi_blocks: Optional[Tensor],
) -> None
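
A construction sketch. The field semantics suggested in the comments (block ids, per-block sequence assignment, usage counts, optional ALiBi bias blocks) are inferred from the field names and are assumptions, not documented guarantees; every field may also be None.

import torch
from vllm_gaudi.attention.ops.hpu_paged_attn import HPUPagedAttentionMetadata

metadata = HPUPagedAttentionMetadata(
    block_list=torch.tensor([0, 1, 2]),       # flattened block ids for the batch
    block_mapping=torch.tensor([0, 0, 1]),    # which sequence each block belongs to
    block_usage=torch.tensor([128, 64, 32]),  # filled slots per block
    block_groups=torch.tensor([0, 0, 1]),
    alibi_blocks=None,                        # only populated for ALiBi models
)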

HPUPagedAttentionMetadataBuilder dataclass

Bases: AttentionMetadataBuilder

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
@dataclass
class HPUPagedAttentionMetadataBuilder(AttentionMetadataBuilder):

    def __init__(self, input_builder: "HPUPageAttentionInputBuilderBase") -> None:
        """Create the builder, remember some configuration and parameters."""
        self.input_builder = input_builder

    def prepare(self) -> None:
        """Prepare for one batch."""
        pass

    def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int,
              batch_size: int) -> type[HPUPagedAttentionMetadata]:
        """Build attention metadata with on-device tensors."""
        return HPUPagedAttentionMetadata

input_builder instance-attribute

input_builder = input_builder

__init__

__init__(
    input_builder: HPUPageAttentionInputBuilderBase,
) -> None

Create the builder, remember some configuration and parameters.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
def __init__(self, input_builder: "HPUPageAttentionInputBuilderBase") -> None:
    """Create the builder, remember some configuration and parameters."""
    self.input_builder = input_builder

build

build(
    seq_lens: list[int],
    query_lens: list[int],
    cuda_graph_pad_size: int,
    batch_size: int,
) -> type[HPUPagedAttentionMetadata]

Build attention metadata with on-device tensors.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int,
          batch_size: int) -> type[HPUPagedAttentionMetadata]:
    """Build attention metadata with on-device tensors."""
    return HPUPagedAttentionMetadata

prepare

prepare() -> None

Prepare for one batch.

Source code in vllm_gaudi/attention/ops/hpu_paged_attn.py
def prepare(self) -> None:
    """Prepare for one batch."""
    pass
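
An end-to-end builder sketch, assuming the AttentionMetadataBuilder base class imposes no further required overrides. Note that build() returns the HPUPagedAttentionMetadata class itself rather than a populated instance, so the caller constructs the metadata separately; argument values are placeholders.

from vllm_gaudi.attention.ops.hpu_paged_attn import (
    HPUPagedAttentionMetadata, HPUPagedAttentionMetadataBuilder,
    HPUPageAttentionInputBuilderBase)

builder = HPUPagedAttentionMetadataBuilder(HPUPageAttentionInputBuilderBase())
builder.prepare()  # no per-batch state to set up in this implementation
metadata_cls = builder.build(seq_lens=[128, 64], query_lens=[1, 1],
                             cuda_graph_pad_size=0, batch_size=2)
assert metadata_cls is HPUPagedAttentionMetadata  # the class, not an instance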