warmup and allocate

This commit is contained in:
GeeeekExplorer
2025-06-27 01:51:57 +08:00
parent cfc4cb6710
commit 658520b788
4 changed files with 25 additions and 8 deletions

View File

@@ -6,7 +6,7 @@ from transformers import AutoConfig
@dataclass
class Config:
model: str
- max_num_batched_tokens: int = 32768
+ max_num_batched_tokens: int = 16384
max_num_seqs: int = 512
max_model_len: int = 4096
gpu_memory_utilization: float = 0.9
@@ -23,3 +23,4 @@ class Config:
assert 1 <= self.tensor_parallel_size <= 8
self.hf_config = AutoConfig.from_pretrained(self.model)
self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
+ assert self.max_num_batched_tokens >= self.max_model_len