init commit
This commit is contained in:
28
nanovllm/utils/context.py
Normal file
28
nanovllm/utils/context.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
import torch
|
||||
|
||||
|
||||
@dataclass
class Context:
    """Per-forward-pass attention metadata shared globally across layers.

    A populated instance is installed via set_context() before a model
    forward and cleared with reset_context() afterwards; layers read it
    through get_context().
    """
    # True while running the prefill phase; False during decode.
    is_prefill: bool = False
    # Cumulative query sequence lengths — presumably flash-attn varlen
    # style (prefix sums over the batch); TODO(review): confirm at call site.
    cu_seqlens_q: torch.Tensor | None = None
    # Cumulative key sequence lengths (same convention as cu_seqlens_q).
    cu_seqlens_k: torch.Tensor | None = None
    # Longest query sequence length in the batch.
    max_seqlen_q: int = 0
    # Longest key sequence length in the batch.
    max_seqlen_k: int = 0
    # NOTE(review): looks like a per-token index into the KV cache —
    # confirm shape/meaning against the attention kernel.
    slot_mapping: torch.Tensor | None = None
    # Per-sequence context lengths used during decode — TODO confirm.
    context_lens: torch.Tensor | None = None
    # Per-sequence tables of KV-cache block ids — TODO confirm.
    block_tables: torch.Tensor | None = None
|
||||
# Module-level singleton holding the context for the current forward pass.
_CONTEXT = Context()
||||
|
||||
def get_context():
    """Return the current global Context instance."""
    return _CONTEXT
|
||||
|
||||
def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None, ):
    """Install a fresh global Context built from the given attention metadata.

    Overwrites the module-level singleton; any previously set context is
    discarded.
    """
    global _CONTEXT
    _CONTEXT = Context(
        is_prefill=is_prefill,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
        slot_mapping=slot_mapping,
        context_lens=context_lens,
        block_tables=block_tables,
    )
|
||||
|
||||
def reset_context():
    """Replace the global context with a fresh, default-valued Context."""
    global _CONTEXT
    _CONTEXT = Context()
|
||||
14
nanovllm/utils/memory.py
Normal file
14
nanovllm/utils/memory.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import os
|
||||
import subprocess
|
||||
import torch
|
||||
|
||||
|
||||
def get_gpu_memory(device_id: int = 0):
    """Query total/used/free memory of one GPU via ``nvidia-smi``.

    Args:
        device_id: index of the GPU to query.

    Returns:
        ``(total_memory, used_memory, free_memory)`` as ints, in MiB
        (``--format=nounits`` makes nvidia-smi emit plain MiB numbers).

    Raises:
        subprocess.CalledProcessError: if nvidia-smi exits non-zero.
        FileNotFoundError: if nvidia-smi is not on PATH.
    """
    # Fix: synchronize the device being queried, not whatever the current
    # device happens to be, so pending work on it is reflected in the report.
    torch.cuda.synchronize(device_id)
    result = subprocess.check_output(
        ['nvidia-smi', '-i', str(device_id),
         '--query-gpu=memory.total,memory.used,memory.free',
         '--format=csv,nounits,noheader'],
        encoding='utf-8'
    )
    # Fix: split on bare commas and strip each field, instead of depending
    # on nvidia-smi emitting exactly ", " as the separator.
    total_memory, used_memory, free_memory = (int(x.strip()) for x in result.strip().split(','))
    return total_memory, used_memory, free_memory
|
||||
|
||||
31
nanovllm/utils/timer.py
Normal file
31
nanovllm/utils/timer.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from contextlib import contextmanager
|
||||
from collections import defaultdict
|
||||
import torch
|
||||
|
||||
|
||||
class CUDATimer:
    """Accumulates GPU wall-clock time per named code section using CUDA events."""

    def __init__(self):
        # name -> list of (start_event, end_event) pairs, one pair per record() call.
        self.events = defaultdict(list)

    @contextmanager
    def record(self, name, enabled=True):
        """Time the enclosed region on the GPU under *name*.

        When ``enabled`` is False the region runs untimed and nothing is
        recorded (zero overhead path).
        """
        if not enabled:
            yield
        else:
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            self.events[name].append((start, end))
            start.record()
            try:
                yield
            finally:
                # Fix: record the end event even if the timed region raises,
                # so log() never hits a pair whose end was never recorded.
                end.record()

    def log(self):
        """Synchronize, then return a summary string of all recorded timings."""
        # elapsed_time() requires both events to have completed.
        torch.cuda.synchronize()
        ret = []
        for name, events in self.events.items():
            # Fix: count the calls recorded under THIS name; the original
            # used len(self.events), i.e. the number of distinct names.
            count = len(events)
            total = sum(start.elapsed_time(end) for start, end in events)
            ret.append(f"{name} {total:.2f}ms/{count}times")
        return ", ".join(ret)
|
||||
Reference in New Issue
Block a user