用于指标日志记录和 GPU 内存监控的实用函数。
该模块提供了用于跟踪 GPU 内存使用情况、测量模型层大小以及在压缩工作流中进行全面日志记录的函数。支持 NVIDIA 和 AMD GPU 监控,并提供详细的内存统计信息和性能指标。
类
CompressionLogger
CompressionLogger(module: Module)
记录与压缩算法相关的指标
参数
module (torch.nn.Module) – 传入的模块,保存为 self.module
源代码位于 llmcompressor/utils/metric_logging.py
def __init__(self, module: torch.nn.Module):
    """
    Set up a logger bound to a single module.

    :param module: module stored on the instance as ``self.module``
    """
    # Both fields start unset; nothing in this constructor populates them.
    self.start_tick = self.loss = None
    self.module = module
|
get_GPU_usage_amd
get_GPU_usage_amd() -> List[Tuple[float, float]]
使用 amdsmi 库获取 AMD GPU 的 GPU 使用情况
源代码位于 llmcompressor/utils/metric_logging.py
def get_GPU_usage_amd() -> List[Tuple[float, float]]:
    """
    Get GPU memory usage for AMD GPUs using the amdsmi library.

    Best-effort: if amdsmi cannot be imported, or the library raises
    ``AmdSmiException`` at any point, a warning is logged and whatever was
    collected so far is returned (possibly an empty list).

    :return: one ``(used_fraction, total)`` tuple per processor handle, where
        ``used_fraction`` is VRAM used divided by VRAM total and ``total`` is
        VRAM total divided by 1e9 (gigabytes, assuming amdsmi reports bytes)
    """
    usage = []

    # Only the import is guarded by ImportError so that unrelated import
    # failures deeper in the call chain are not mislabelled.
    try:
        import amdsmi
    except ImportError:
        logger.warning("Failed to obtain GPU usage from amdsmi")
        return usage

    try:
        amdsmi.amdsmi_init()
        try:
            vram = amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
            for device in amdsmi.amdsmi_get_processor_handles():
                vram_memory_usage = amdsmi.amdsmi_get_gpu_memory_usage(device, vram)
                vram_memory_total = amdsmi.amdsmi_get_gpu_memory_total(device, vram)
                # Guard against a device that reports no VRAM at all instead
                # of crashing the whole query with ZeroDivisionError.
                if vram_memory_total:
                    memory_percentage = vram_memory_usage / vram_memory_total
                else:
                    memory_percentage = 0.0
                usage.append(
                    (memory_percentage, vram_memory_total / (1e9)),
                )
        finally:
            # Always pair a successful amdsmi_init() with a shutdown, even
            # when a query above raised part-way through the device list.
            amdsmi.amdsmi_shut_down()
    except amdsmi.AmdSmiException as error:
        logger.warning(f"amdsmi library error:\n {error}")

    return usage
|
get_GPU_usage_nv
get_GPU_usage_nv() -> List[Tuple[float, float]]
使用 nvml 库获取 Nvidia GPU 的 GPU 使用情况
源代码位于 llmcompressor/utils/metric_logging.py
def get_GPU_usage_nv() -> List[Tuple[float, float]]:
    """
    Get GPU memory usage for Nvidia GPUs using the nvml library (pynvml).

    Best-effort: if pynvml cannot be imported or NVML fails to initialise,
    a warning is logged and an empty list is returned. If NVML raises while
    devices are being queried, a warning is logged and the entries collected
    so far are returned.

    :return: one ``(used_fraction, total_gb)`` tuple per device index, where
        ``used_fraction`` is ``mem_info.used / mem_info.total`` and
        ``total_gb`` is ``mem_info.total / 1e9``
    """
    try:
        import pynvml
        from pynvml import NVMLError
    except ImportError:
        logger.warning("Failed to obtain GPU usage from pynvml")
        return []

    try:
        pynvml.nvmlInit()
    except NVMLError as _err:
        logger.warning(f"pynvml library error:\n {_err}")
        return []

    usage = []  # [(used_fraction, total_memory_GB)]
    try:
        try:
            # Iterate through all GPUs
            for i in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                # Guard against a device reporting zero total memory rather
                # than failing the whole query with ZeroDivisionError.
                if mem_info.total:
                    memory_usage_percentage = mem_info.used / mem_info.total
                else:
                    memory_usage_percentage = 0.0
                total_memory_gb = mem_info.total / (1e9)
                usage.append(
                    (memory_usage_percentage, total_memory_gb),
                )
        finally:
            # nvmlInit() succeeded above, so always release NVML, even when
            # a device query raised part-way through.
            pynvml.nvmlShutdown()
    except NVMLError as _err:
        # Mirror the AMD helper: report library errors and return what we have.
        logger.warning(f"pynvml library error:\n {_err}")

    return usage
|