diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py
index c95d120..623fe2d 100644
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -35,7 +35,7 @@ class ModelRunner:
         total, used, _ = get_gpu_memory()
         free = total * gpu_memory_utilization - used
         block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * hf_config.num_key_value_heads * hf_config.head_dim * hf_config.torch_dtype.itemsize
-        config.num_kvcache_blocks = int(free * 1e6) // block_bytes
+        config.num_kvcache_blocks = int(free) // block_bytes
         self.kv_cache = torch.zeros(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, hf_config.num_key_value_heads, hf_config.head_dim)
         layer_id = 0
         for module in self.model.modules():
diff --git a/nanovllm/utils/memory.py b/nanovllm/utils/memory.py
index 4d87b31..83f7729 100644
--- a/nanovllm/utils/memory.py
+++ b/nanovllm/utils/memory.py
@@ -1,14 +1,18 @@
 import os
-import subprocess
 import torch
+from pynvml import *


-def get_gpu_memory(device_id: int = 0):
+def get_gpu_memory():
     torch.cuda.synchronize()
-    result = subprocess.check_output(
-        ['nvidia-smi', '-i', str(device_id), '--query-gpu=memory.total,memory.used,memory.free', '--format=csv,nounits,noheader'],
-        encoding='utf-8'
-    )
-    total_memory, used_memory, free_memory = [int(x) for x in result.strip().split(', ')]
+    nvmlInit()
+    visible_device = list(map(int, os.getenv("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(',')))
+    cuda_device_idx = torch.cuda.current_device()
+    cuda_device_idx = visible_device[cuda_device_idx]
+    handle = nvmlDeviceGetHandleByIndex(cuda_device_idx)
+    mem_info = nvmlDeviceGetMemoryInfo(handle)
+    total_memory = mem_info.total
+    used_memory = mem_info.used
+    free_memory = mem_info.free
+    nvmlShutdown()
     return total_memory, used_memory, free_memory
-
\ No newline at end of file
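
Note on the unit change in model_runner.py: nvidia-smi invoked with --format=csv,nounits reports memory in MiB, so the old code scaled free by 1e6 to approximate bytes (a slight underestimate of the exact 2**20 bytes per MiB, so it erred conservative) before dividing by block_bytes. NVML's nvmlDeviceGetMemoryInfo returns byte counts directly, so the scale factor is dropped and both operands of the division are in bytes. A minimal sketch of the arithmetic after the patch; the model dimensions and memory figures below are hypothetical, not from the diff:

    # Byte budget left for the KV cache, using NVML-style byte counts.
    total, used = 24 * 1024**3, 4 * 1024**3   # hypothetical 24 GiB card with 4 GiB in use
    gpu_memory_utilization = 0.9              # fraction of total memory the engine may claim
    free = total * gpu_memory_utilization - used

    # Per-block footprint: K and V tensors (factor 2) for every layer, block_size
    # tokens each, num_key_value_heads * head_dim values per token, itemsize bytes
    # per value (2 for fp16/bf16).
    num_hidden_layers, block_size = 32, 256
    num_key_value_heads, head_dim, itemsize = 8, 128, 2
    block_bytes = 2 * num_hidden_layers * block_size * num_key_value_heads * head_dim * itemsize

    num_kvcache_blocks = int(free) // block_bytes  # both sides now in bytes

On the memory.py side, the CUDA_VISIBLE_DEVICES remapping exists because torch.cuda.current_device() returns an ordinal within the visible device set, while NVML indexes physical GPUs; the patch translates the former into the latter before calling nvmlDeviceGetHandleByIndex.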