跳到内容

llmcompressor.utils.metric_logging

用于指标日志记录和 GPU 内存监控的实用函数。

该模块提供了用于跟踪 GPU 内存使用情况、测量模型层大小以及在压缩工作流中进行全面日志记录的函数。支持 NVIDIA 和 AMD GPU 监控,并提供详细的内存统计信息和性能指标。

CompressionLogger

CompressionLogger(module: Module)

记录与压缩算法相关的指标

参数

  • start_tick

    算法开始时间

  • loss

    算法产生的损失

源代码位于 llmcompressor/utils/metric_logging.py
def __init__(self, module: torch.nn.Module):
    """
    Bind the logger to a module and reset its tracked state

    :param module: module stored on the instance as ``self.module``
    """
    self.module = module
    # nothing has been recorded yet, so both tracked values start out unset
    self.start_tick = self.loss = None

get_GPU_usage_amd

get_GPU_usage_amd() -> List[Tuple[float, float]]

使用 amdsmi 库获取 AMD GPU 的 GPU 使用情况

源代码位于 llmcompressor/utils/metric_logging.py
def get_GPU_usage_amd() -> List[Tuple[float, float]]:
    """
    get gpu usage for AMD GPUs using amdsmi lib

    Best-effort helper: a missing ``amdsmi`` package or an amdsmi library
    error is logged as a warning instead of being raised.

    :return: one ``(used_fraction, total_memory)`` tuple per device handle,
        where ``used_fraction`` is VRAM usage divided by VRAM total and
        ``total_memory`` is VRAM total divided by 1e9 (GB, assuming the
        library reports bytes). Empty when amdsmi is unavailable; may be
        partial when a library error interrupts the device loop.
    """
    try:
        import amdsmi
    except ImportError:
        logger.warning("Failed to obtain GPU usage from amdsmi")
        return []

    usage = []
    try:
        amdsmi.amdsmi_init()
        try:
            vram = amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
            for device in amdsmi.amdsmi_get_processor_handles():
                vram_memory_usage = amdsmi.amdsmi_get_gpu_memory_usage(device, vram)
                vram_memory_total = amdsmi.amdsmi_get_gpu_memory_total(device, vram)

                memory_percentage = vram_memory_usage / vram_memory_total
                usage.append(
                    (memory_percentage, vram_memory_total / (1e9)),
                )
        finally:
            # init succeeded, so always release the library session, even
            # when a query above raised part-way through the device loop
            amdsmi.amdsmi_shut_down()
    except amdsmi.AmdSmiException as error:
        logger.warning(f"amdsmi library error:\n {error}")

    return usage

get_GPU_usage_nv

get_GPU_usage_nv() -> List[Tuple[float, float]]

使用 nvml 库获取 Nvidia GPU 的 GPU 使用情况

源代码位于 llmcompressor/utils/metric_logging.py
def get_GPU_usage_nv() -> List[Tuple[float, float]]:
    """
    get gpu usage for Nvidia GPUs using nvml lib

    Best-effort helper: a missing ``pynvml`` package or an NVML error is
    logged as a warning instead of being raised.

    :return: one ``(used_fraction, total_memory_gb)`` tuple per device index,
        where ``used_fraction`` is ``used / total`` and ``total_memory_gb``
        is ``total / 1e9`` from the device memory info. Empty when pynvml is
        unavailable or fails to initialize; may be partial when an NVML
        error interrupts the device loop.
    """
    try:
        import pynvml
        from pynvml import NVMLError
    except ImportError:
        logger.warning("Failed to obtain GPU usage from pynvml")
        return []

    try:
        pynvml.nvmlInit()
    except NVMLError as _err:
        logger.warning(f"Pynml library error:\n {_err}")
        return []

    usage = []  # [(used_fraction, total_memory_GB)]
    try:
        try:
            # Iterate through all GPUs
            for i in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                memory_usage_percentage = mem_info.used / mem_info.total
                total_memory_gb = mem_info.total / (1e9)
                usage.append(
                    (memory_usage_percentage, total_memory_gb),
                )
        finally:
            # init succeeded, so always release NVML, even when a query
            # above raised part-way through the device loop
            pynvml.nvmlShutdown()
    except NVMLError as _err:
        # a failed query should not abort the caller; report what was read
        logger.warning(f"Pynml library error:\n {_err}")

    return usage