[feat] Added bench_offload.py and GreedySampler.
This commit is contained in:
@@ -7,7 +7,7 @@ from multiprocessing.shared_memory import SharedMemory
|
||||
from nanovllm.config import Config
|
||||
from nanovllm.engine.sequence import Sequence
|
||||
from nanovllm.models.qwen3 import Qwen3ForCausalLM
|
||||
-from nanovllm.layers.sampler import Sampler
|
||||
+from nanovllm.layers.sampler import GreedySampler
|
||||
from nanovllm.utils.context import set_context, get_context, reset_context
|
||||
from nanovllm.utils.loader import load_model
|
||||
from nanovllm.utils.logger import get_logger
|
||||
@@ -34,7 +34,7 @@ class ModelRunner:
|
||||
torch.set_default_device("cuda")
|
||||
self.model = Qwen3ForCausalLM(hf_config)
|
||||
load_model(self.model, config.model)
|
||||
-self.sampler = Sampler()
|
||||
+self.sampler = GreedySampler()
|
||||
self.warmup_model()
|
||||
self.allocate_kv_cache()
|
||||
if not self.enforce_eager:
|
||||
|
||||
Reference in New Issue
Block a user