warmup and allocate

2025-06-27 01:51:57 +08:00
parent cfc4cb6710
commit 658520b788
4 changed files with 25 additions and 8 deletions
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -6,7 +6,7 @@ from transformers import AutoConfig
 @dataclass
 class Config:
     model: str
-    max_num_batched_tokens: int = 32768
+    max_num_batched_tokens: int = 16384
     max_num_seqs: int = 512
     max_model_len: int = 4096
     gpu_memory_utilization: float = 0.9
@@ -23,3 +23,4 @@ class Config:
         assert 1 <= self.tensor_parallel_size <= 8
         self.hf_config = AutoConfig.from_pretrained(self.model)
         self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
+        assert self.max_num_batched_tokens >= self.max_model_len