38 lines
1.4 KiB
Python
38 lines
1.4 KiB
Python
import os
|
||
from dataclasses import dataclass
|
||
from transformers import AutoConfig
|
||
|
||
|
||
@dataclass
class Config:
    """Engine configuration: model location, batching limits and KV-cache sizing.

    ``model`` must be the path of an existing local directory.
    ``__post_init__`` validates the settings, loads the Hugging Face config
    from that directory into ``hf_config`` and clamps ``max_model_len`` to the
    model's ``max_position_embeddings``.
    """

    model: str
    max_num_batched_tokens: int = 16384
    max_num_seqs: int = 512
    max_model_len: int = 4096
    gpu_memory_utilization: float = 0.9
    tensor_parallel_size: int = 1
    enforce_eager: bool = False
    hf_config: AutoConfig | None = None  # Overwritten in __post_init__
    eos: int = -1
    kvcache_block_size: int = 256  # Must be a positive multiple of 256
    num_kvcache_blocks: int = -1

    # CPU Offload configuration
    enable_cpu_offload: bool = False
    offload_policy: str = "lru"  # "lru", "fifo", or full class path
    num_transfer_streams: int = 4  # Number of CUDA streams for async transfers
    num_gpu_blocks: int = -1  # User-specified GPU blocks count, -1 = auto (use max available)
    num_prefetch_blocks: int = 2  # Number of blocks in the prefetch region, used by the three-region GPU buffer design

    # Computed fields for offload (set in __post_init__ or by ModelRunner)
    num_gpu_kvcache_blocks: int = -1
    num_cpu_kvcache_blocks: int = -1

    def __post_init__(self):
        """Validate the settings and load the Hugging Face model config.

        Side effects:
            Sets ``hf_config`` from ``AutoConfig.from_pretrained(model)`` and
            lowers ``max_model_len`` to ``hf_config.max_position_embeddings``
            when the configured value exceeds it.

        Raises:
            AssertionError: If ``model`` is not an existing directory, if
                ``kvcache_block_size`` is not a positive multiple of 256, if
                ``tensor_parallel_size`` is outside ``[1, 8]``, or if
                ``max_num_batched_tokens`` is smaller than the (clamped)
                ``max_model_len``.
        """
        # Explicit raises instead of ``assert`` statements: the checks stay
        # active under ``python -O`` while the exception type is unchanged.
        if not os.path.isdir(self.model):
            raise AssertionError(
                f"model must be an existing directory, got {self.model!r}"
            )
        # ``0 % 256 == 0`` and ``-256 % 256 == 0``, so positivity must be
        # checked separately from divisibility.
        if self.kvcache_block_size <= 0 or self.kvcache_block_size % 256 != 0:
            raise AssertionError(
                "kvcache_block_size must be a positive multiple of 256, "
                f"got {self.kvcache_block_size}"
            )
        if not 1 <= self.tensor_parallel_size <= 8:
            raise AssertionError(
                "tensor_parallel_size must be between 1 and 8, "
                f"got {self.tensor_parallel_size}"
            )

        self.hf_config = AutoConfig.from_pretrained(self.model)
        # Never allow sequences longer than the model's positional range.
        self.max_model_len = min(
            self.max_model_len, self.hf_config.max_position_embeddings
        )
        if self.max_num_batched_tokens < self.max_model_len:
            raise AssertionError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must be "
                f">= max_model_len ({self.max_model_len})"
            )
|