
vllm_gaudi.utils

T module-attribute

T = TypeVar('T')

U module-attribute

U = TypeVar('U')

HPUCompileConfig

Configuration class that holds the arguments that will be passed to torch.compile with the HPU backend.

Source code in vllm_gaudi/utils.py
class HPUCompileConfig:
    """
    Configuration class, which holds arguments that will be
    passed to torch compile with HPU backend.
    """

    def __init__(self, fullgraph: Optional[bool] = None, dynamic: Optional[bool] = None):
        """
        Allow to override the environment variables for corner case scenarios
        when single functions are compiled with torch.compile decorator.
        Env variables should not be overwritten when it comes to compilation
        of the whole model.
        """
        self.fullgraph = fullgraph if fullgraph is not None else \
            get_config().fullgraph_compilation
        self.dynamic = dynamic if dynamic is not None else \
            get_config().dynamic_shapes_compilation
        self.regional_compilation = get_config().regional_compilation

    def get_compile_args(self) -> dict[str, Any]:
        """
        Returns a dictionary of compile arguments that can be used
        with torch.compile method or decorator
        """
        if self.dynamic:
            return {'backend': 'hpu_backend', 'fullgraph': self.fullgraph, 'options': {"force_static_compile": True}}
        else:
            return {'backend': 'hpu_backend', 'fullgraph': self.fullgraph, 'dynamic': False}

dynamic instance-attribute

dynamic = (
    dynamic
    if dynamic is not None
    else dynamic_shapes_compilation
)

fullgraph instance-attribute

fullgraph = (
    fullgraph
    if fullgraph is not None
    else fullgraph_compilation
)

regional_compilation instance-attribute

regional_compilation = regional_compilation

__init__

__init__(
    fullgraph: Optional[bool] = None,
    dynamic: Optional[bool] = None,
)

Allows overriding the environment variables in corner-case scenarios where individual functions are compiled with the torch.compile decorator. The environment variables should not be overridden when compiling the whole model.

Source code in vllm_gaudi/utils.py
def __init__(self, fullgraph: Optional[bool] = None, dynamic: Optional[bool] = None):
    """
    Allow to override the environment variables for corner case scenarios
    when single functions are compiled with torch.compile decorator.
    Env variables should not be overwritten when it comes to compilation
    of the whole model.
    """
    self.fullgraph = fullgraph if fullgraph is not None else \
        get_config().fullgraph_compilation
    self.dynamic = dynamic if dynamic is not None else \
        get_config().dynamic_shapes_compilation
    self.regional_compilation = get_config().regional_compilation

get_compile_args

get_compile_args() -> dict[str, Any]

Returns a dictionary of compile arguments that can be used with the torch.compile method or decorator.

Source code in vllm_gaudi/utils.py
def get_compile_args(self) -> dict[str, Any]:
    """
    Returns a dictionary of compile arguments that can be used
    with torch.compile method or decorator
    """
    if self.dynamic:
        return {'backend': 'hpu_backend', 'fullgraph': self.fullgraph, 'options': {"force_static_compile": True}}
    else:
        return {'backend': 'hpu_backend', 'fullgraph': self.fullgraph, 'dynamic': False}
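
As a usage sketch (not part of the source), the config can drive torch.compile used as a decorator. This assumes an HPU runtime with the hpu_backend registered and a plugin config reachable via get_config(); fused_scale_add is a hypothetical function used only for illustration.

import torch
from vllm_gaudi.utils import HPUCompileConfig

# Override fullgraph/dynamic for this one call site; unset fields still
# fall back to the plugin-level environment configuration.
config = HPUCompileConfig(fullgraph=True, dynamic=False)

@torch.compile(**config.get_compile_args())
def fused_scale_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Hypothetical compiled function, for illustration only
    return x * 2.0 + y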

async_h2d_copy

async_h2d_copy(
    source, dest_tensor=None, dtype=None, device="hpu"
)

Asynchronously transfer data from host to device.

Parameters

Name          Type    Description                                  Default
source                CPU tensor or raw data to transfer           required
dest_tensor           Optional pre-allocated destination tensor    None
dtype                 Required if source is raw data               None
device                Target device                                'hpu'

Returns

torch.Tensor on the target device.

Source code in vllm_gaudi/utils.py
def async_h2d_copy(source, dest_tensor=None, dtype=None, device='hpu'):
    """
    Asynchronously transfer data from host to device.

    Args:
        source: CPU tensor or raw data to transfer
        dest_tensor: Optional pre-allocated destination tensor
        dtype: Required if source is raw data
        device: Target device

    Returns:
        torch.Tensor on target device
    """
    if isinstance(source, torch.Tensor):
        if dest_tensor is not None:
            # Copy into pre-allocated destination tensor
            return dest_tensor.copy_(source, non_blocking=True)
        # Create new device tensor and copy
        assert source.device.type == 'cpu', \
            "Source tensor must be on CPU for asynchronous transfer"
        target = torch.empty_like(source, device=device)
        return target.copy_(source, non_blocking=True)
    # Create tensor from data and transfer to device
    if dtype is None:
        raise ValueError("dtype must be specified when source is not a tensor")
    cpu_tensor = torch.tensor(source, dtype=dtype, device='cpu')
    return cpu_tensor.to(device, non_blocking=True)
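
A short usage sketch (not from the source) covering the three call patterns; it assumes an 'hpu' device is available. Note that copy_(..., non_blocking=True) only overlaps with host compute when the CPU source lives in pinned memory; otherwise the copy may effectively be synchronous.

import torch
from vllm_gaudi.utils import async_h2d_copy

src = torch.arange(16, dtype=torch.int32)              # host tensor
moved = async_h2d_copy(src)                            # fresh device tensor

buf = torch.empty(16, dtype=torch.int32, device='hpu')
async_h2d_copy(src, dest_tensor=buf)                   # reuse a pre-allocated buffer

ids = async_h2d_copy([1, 2, 3], dtype=torch.long)      # raw data requires a dtype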

async_h2d_update

async_h2d_update(
    source: Tensor,
    dest: Tensor,
    indices: list[int],
    device="hpu",
)

Asynchronously update specific rows of a device tensor from a CPU tensor.

Parameters

Name      Type        Description                               Default
source    Tensor      CPU tensor with data to copy              required
dest      Tensor      Device tensor to update                   required
indices   list[int]   List of row indices in dest to update     required
device                Target device                             'hpu'
Source code in vllm_gaudi/utils.py
def async_h2d_update(source: torch.Tensor, dest: torch.Tensor, indices: list[int], device='hpu'):
    """
    Asynchronously update specific rows of a device tensor from a CPU tensor.

    Args:
        source: CPU tensor with data to copy
        dest: Device tensor to update
        indices: List of row indices in dest to update
        device: Target device
    """
    dest[indices] = source[indices].to(device, non_blocking=True)
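
A minimal sketch, assuming an 'hpu' device: the selected rows are gathered on the host, moved with a non-blocking copy, and scattered into the device tensor.

import torch
from vllm_gaudi.utils import async_h2d_update

cpu_table = torch.randn(8, 4)                  # host-side source table
hpu_table = torch.zeros(8, 4, device='hpu')    # device-side table to patch
async_h2d_update(cpu_table, hpu_table, indices=[0, 3, 5])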

hpu_backend_string cached

hpu_backend_string()
Source code in vllm_gaudi/utils.py
@cache
def hpu_backend_string():
    backend_string = 'hccl' if not is_fake_hpu() else 'gloo'
    return backend_string

hpu_device_string cached

hpu_device_string()
Source code in vllm_gaudi/utils.py
@cache
def hpu_device_string():
    device_string = 'hpu' if not is_fake_hpu() else 'cpu'
    return device_string

is_fake_hpu cached

is_fake_hpu() -> bool
Source code in vllm_gaudi/utils.py
@cache
def is_fake_hpu() -> bool:
    return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'
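
Because all three helpers are memoized with @cache, VLLM_USE_FAKE_HPU must be set before the first call in the process; later changes to the environment variable have no effect. A minimal sketch:

import os
os.environ['VLLM_USE_FAKE_HPU'] = '1'  # must be set before the first (cached) call

from vllm_gaudi.utils import is_fake_hpu, hpu_backend_string, hpu_device_string

assert is_fake_hpu()
assert hpu_backend_string() == 'gloo'  # falls back from 'hccl'
assert hpu_device_string() == 'cpu'    # falls back from 'hpu'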

make_mrope_positions_tensor_with_pad

make_mrope_positions_tensor_with_pad(
    input_positions: list[list[int]],
    input_mrope_positions: list[list[list[int]]],
    max_prompt_len: int,
    pad: int,
) -> Tensor
Source code in vllm_gaudi/utils.py
def make_mrope_positions_tensor_with_pad(input_positions: list[list[int]], input_mrope_positions: list[list[list[int]]],
                                         max_prompt_len: int, pad: int) -> torch.Tensor:
    # If there are no mrope positions, return a flattened (seq_len,) tensor
    if all(mrope_position is None for mrope_position in input_mrope_positions):
        return make_tensor_with_pad(input_positions, max_len=max_prompt_len, pad=0, dtype=torch.long,
                                    device='cpu').flatten()
    # Otherwise, Qwen2.5-VL expects positions in a (3, seq_len) layout:
    # pad each seq_data in the list using either its M-RoPE values
    # or its regular positions
    mrope_input_positions: list[list[int]] = [[] for _ in range(3)]
    for idx in range(3):
        for b_idx, input_mrope_position in enumerate(input_mrope_positions):
            positions = input_mrope_position[idx] if input_mrope_position is not None else input_positions[b_idx]
            padding_size = max_prompt_len - len(positions)
            assert padding_size >= 0
            padded_positions = positions + padding_size * [pad]
            mrope_input_positions[idx].extend(padded_positions)
    return torch.tensor(mrope_input_positions, dtype=torch.long, device='cpu')
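
An illustrative call with made-up values: the first sequence carries (3, seq_len) M-RoPE positions, the second falls back to its regular positions; each sequence is padded to max_prompt_len and the batch is concatenated along each of the three axes.

import torch
from vllm_gaudi.utils import make_mrope_positions_tensor_with_pad

input_positions = [[0, 1, 2], [0, 1]]
input_mrope_positions = [[[0, 1, 2], [0, 0, 0], [0, 0, 0]], None]
out = make_mrope_positions_tensor_with_pad(input_positions, input_mrope_positions,
                                           max_prompt_len=4, pad=-1)
print(out.shape)  # torch.Size([3, 8]): two sequences, each padded to length 4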

make_ndarray_with_pad_align

make_ndarray_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: DTypeLike,
    *,
    max_len_align: int = 1024,
) -> NDArray

Make a padded array from 2D inputs. The padding is applied to the end of each inner list until it reaches max_len_aligned, i.e. max_len rounded up to the nearest multiple of max_len_align.

Source code in vllm_gaudi/utils.py
def make_ndarray_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: npt.DTypeLike,
    *,
    max_len_align: int = 1024,
) -> npt.NDArray:
    """
    Make a padded array from 2D inputs.
    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    """
    # Unlike for most functions, map is faster than a genexpr over `len`
    max_len = max(map(len, x), default=0)
    max_len_aligned = math.ceil(max_len / max_len_align) * max_len_align
    padded_x = np.full((len(x), max_len_aligned), pad, dtype=dtype)

    for ind, blocktb in enumerate(x):
        assert len(blocktb) <= max_len_aligned
        padded_x[ind, :len(blocktb)] = blocktb

    return padded_x
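
For example (a minimal sketch), a batch whose longest row has 3 elements is padded out to the alignment boundary:

import numpy as np
from vllm_gaudi.utils import make_ndarray_with_pad_align

arr = make_ndarray_with_pad_align([[1, 2, 3], [4]], pad=0, dtype=np.int64,
                                  max_len_align=8)
print(arr.shape)  # (2, 8): max_len=3 is rounded up to the alignment of 8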

make_tensor_with_pad_align

make_tensor_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: dtype,
    *,
    max_len_align: int = 1024,
    device: Optional[Union[str, device]] = None,
    pin_memory: bool = False,
) -> Tensor

Make a padded tensor from 2D inputs. The padding is applied to the end of each inner list until it reaches max_len_aligned, where max_len_aligned is max_len rounded up to the nearest multiple of max_len_align.

Source code in vllm_gaudi/utils.py
def make_tensor_with_pad_align(
    x: list[list[T]],
    pad: T,
    dtype: torch.dtype,
    *,
    max_len_align: int = 1024,
    device: Optional[Union[str, torch.device]] = None,
    pin_memory: bool = False,
) -> torch.Tensor:
    """
    Make a padded tensor from 2D inputs.
    The padding is applied to the end of each inner list until it reaches
    max_len_aligned, max_len_aligned is max_len rounding to the nearest 
    `max_len_align`.
    """
    np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
    padded_x = make_ndarray_with_pad_align(x, pad, np_dtype, max_len_align=max_len_align)

    tensor = torch.from_numpy(padded_x).to(device)
    if pin_memory:
        tensor = tensor.pin_memory()

    return tensor
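
A minimal sketch of the tensor variant; it assumes the requested dtype has an entry in TORCH_DTYPE_TO_NUMPY_DTYPE (torch.long does):

import torch
from vllm_gaudi.utils import make_tensor_with_pad_align

t = make_tensor_with_pad_align([[1, 2, 3], [4]], pad=0, dtype=torch.long,
                               max_len_align=4, device='cpu')
print(t.shape)  # torch.Size([2, 4]): max_len=3 is rounded up to 4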