init commit
This commit is contained in:
28
nanovllm/utils/context.py
Normal file
28
nanovllm/utils/context.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
import torch
|
||||
|
||||
|
||||
@dataclass
class Context:
    """Per-forward-pass attention metadata shared globally across layers.

    A populated instance is installed via set_context() before a model
    forward and cleared with reset_context() afterwards; layers read it
    through get_context().
    """
    # True while running the prefill phase; False during decode.
    is_prefill: bool = False
    # Cumulative query sequence lengths — presumably flash-attn varlen
    # style (prefix sums over the batch); TODO(review): confirm at call site.
    cu_seqlens_q: torch.Tensor | None = None
    # Cumulative key sequence lengths (same convention as cu_seqlens_q).
    cu_seqlens_k: torch.Tensor | None = None
    # Longest query sequence length in the batch.
    max_seqlen_q: int = 0
    # Longest key sequence length in the batch.
    max_seqlen_k: int = 0
    # NOTE(review): looks like a per-token index into the KV cache —
    # confirm shape/meaning against the attention kernel.
    slot_mapping: torch.Tensor | None = None
    # Per-sequence context lengths used during decode — TODO confirm.
    context_lens: torch.Tensor | None = None
    # Per-sequence tables of KV-cache block ids — TODO confirm.
    block_tables: torch.Tensor | None = None
|
||||
# Module-level singleton holding the context for the current forward pass.
_CONTEXT = Context()
||||
|
||||
def get_context():
    """Return the current global Context instance."""
    return _CONTEXT
|
||||
|
||||
def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None, ):
    """Install a fresh global Context built from the given attention metadata.

    Overwrites the module-level singleton; any previously set context is
    discarded.
    """
    global _CONTEXT
    _CONTEXT = Context(
        is_prefill=is_prefill,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
        slot_mapping=slot_mapping,
        context_lens=context_lens,
        block_tables=block_tables,
    )
|
||||
|
||||
def reset_context():
    """Replace the global context with a fresh, default-valued Context."""
    global _CONTEXT
    _CONTEXT = Context()
|
||||
14
nanovllm/utils/memory.py
Normal file
14
nanovllm/utils/memory.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import os
|
||||
import subprocess
|
||||
import torch
|
||||
|
||||
|
||||
def get_gpu_memory(device_id: int = 0):
    """Query total/used/free memory of one GPU via ``nvidia-smi``.

    Args:
        device_id: index of the GPU to query.

    Returns:
        ``(total_memory, used_memory, free_memory)`` as ints, in MiB
        (``--format=nounits`` makes nvidia-smi emit plain MiB numbers).

    Raises:
        subprocess.CalledProcessError: if nvidia-smi exits non-zero.
        FileNotFoundError: if nvidia-smi is not on PATH.
    """
    # Fix: synchronize the device being queried, not whatever the current
    # device happens to be, so pending work on it is reflected in the report.
    torch.cuda.synchronize(device_id)
    result = subprocess.check_output(
        ['nvidia-smi', '-i', str(device_id),
         '--query-gpu=memory.total,memory.used,memory.free',
         '--format=csv,nounits,noheader'],
        encoding='utf-8'
    )
    # Fix: split on bare commas and strip each field, instead of depending
    # on nvidia-smi emitting exactly ", " as the separator.
    total_memory, used_memory, free_memory = (int(x.strip()) for x in result.strip().split(','))
    return total_memory, used_memory, free_memory
|
||||
|
||||
31
nanovllm/utils/timer.py
Normal file
31
nanovllm/utils/timer.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from contextlib import contextmanager
|
||||
from collections import defaultdict
|
||||
import torch
|
||||
|
||||
|
||||
class CUDATimer:
    """Accumulates GPU wall-clock time per named code section using CUDA events."""

    def __init__(self):
        # name -> list of (start_event, end_event) pairs, one pair per record() call.
        self.events = defaultdict(list)

    @contextmanager
    def record(self, name, enabled=True):
        """Time the enclosed region on the GPU under *name*.

        When ``enabled`` is False the region runs untimed and nothing is
        recorded (zero overhead path).
        """
        if not enabled:
            yield
        else:
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            self.events[name].append((start, end))
            start.record()
            try:
                yield
            finally:
                # Fix: record the end event even if the timed region raises,
                # so log() never hits a pair whose end was never recorded.
                end.record()

    def log(self):
        """Synchronize, then return a summary string of all recorded timings."""
        # elapsed_time() requires both events to have completed.
        torch.cuda.synchronize()
        ret = []
        for name, events in self.events.items():
            # Fix: count the calls recorded under THIS name; the original
            # used len(self.events), i.e. the number of distinct names.
            count = len(events)
            total = sum(start.elapsed_time(end) for start, end in events)
            ret.append(f"{name} {total:.2f}ms/{count}times")
        return ", ".join(ret)
|
||||
Reference in New Issue
Block a user