fix
bench.py
@@ -15,6 +15,6 @@ prompt_token_ids = torch.randint(0, 10240, (batch_size, seq_len)).tolist()
 sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_tokens)
 
 t = time.time()
-completions = llm.generate(prompt_token_ids, sampling_params)
+llm.generate(prompt_token_ids, sampling_params)
 throughput = batch_size * max_tokens / (time.time() - t)
 print(f"Throughput: {throughput: .2f}")
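
The only change in this hunk drops the unused completions binding; the benchmark needs only the wall-clock time, not the generated text. For context, a minimal sketch of the whole benchmark around this hunk, assuming the nanovllm import path and hypothetical size constants (the model path is a placeholder):

import time
import torch
from nanovllm import LLM, SamplingParams  # assumed import path

batch_size, seq_len, max_tokens = 64, 1024, 1024  # hypothetical values
llm = LLM("path/to/model")                        # placeholder model path

prompt_token_ids = torch.randint(0, 10240, (batch_size, seq_len)).tolist()
sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_tokens)

t = time.time()
llm.generate(prompt_token_ids, sampling_params)  # return value no longer bound
throughput = batch_size * max_tokens / (time.time() - t)
print(f"Throughput: {throughput: .2f}")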
@@ -52,7 +52,7 @@ class LLMEngine:
             desc="Generating",
             dynamic_ncols=True,
         )
-        if not isinstance(SamplingParams, list):
+        if not isinstance(sampling_params, list):
             sampling_params = [sampling_params] * len(prompts)
         for prompt, sp in zip(prompts, sampling_params):
             self.add_request(prompt, sp)
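
The substantive fix in this hunk: the old guard tested the SamplingParams class itself, and a class object is never a list, so the broadcast branch always ran; a caller passing one SamplingParams per prompt would have its list wrapped into a list of lists. A minimal demonstration with a stand-in class:

class SamplingParams:  # stand-in for the real class
    pass

prompts = ["a", "b"]
sampling_params = [SamplingParams(), SamplingParams()]

# Old check: always True, so an existing list gets wrapped again.
assert not isinstance(SamplingParams, list)

# Fixed check: tests the argument, so a per-prompt list passes through.
if not isinstance(sampling_params, list):
    sampling_params = [sampling_params] * len(prompts)
assert len(sampling_params) == len(prompts)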
@@ -170,7 +170,7 @@ class ModelRunner:
         context_lens = torch.zeros(max_bs, dtype=torch.int32)
         block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32)
         outputs = torch.zeros(max_bs, hf_config.hidden_size)
-        self.graph_bs = [1, 2, 4, 8, 16] + list(range(16, max_bs + 1, 16))
+        self.graph_bs = [1, 2, 4, 8] + list(range(16, max_bs + 1, 16))
         self.graphs = {}
         self.graph_pool = None
 
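
This hunk removes a duplicated CUDA-graph capture size: 16 appeared both at the end of the literal prefix and as the first value of the range, so batch size 16 would be captured twice. With max_bs = 64 as a hypothetical example:

old = [1, 2, 4, 8, 16] + list(range(16, 64 + 1, 16))
new = [1, 2, 4, 8] + list(range(16, 64 + 1, 16))
print(old)  # [1, 2, 4, 8, 16, 16, 32, 48, 64] -- 16 listed twice
print(new)  # [1, 2, 4, 8, 16, 32, 48, 64]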
@@ -1,31 +0,0 @@
-from contextlib import contextmanager
-from collections import defaultdict
-import torch
-
-
-class CUDATimer:
-
-    def __init__(self):
-        self.events = defaultdict(list)
-
-    @contextmanager
-    def record(self, name, enabled=True):
-        if not enabled:
-            yield
-        else:
-            start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
-            self.events[name].append((start, end))
-            start.record()
-            yield
-            end.record()
-
-    def log(self):
-        torch.cuda.synchronize()
-        ret = []
-        for name, events in self.events.items():
-            total = 0
-            count = len(self.events)
-            for start, end in events:
-                total += start.elapsed_time(end)
-            ret.append(f"{name} {total:.2f}ms/{count}times")
-        return ", ".join(ret)
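
The deleted file defined a small CUDA-event profiling helper, CUDATimer. For the record, its log() carried a latent bug: count = len(self.events) counts the number of distinct timer names, where len(events), the number of recordings under the current name, was presumably intended. A corrected sketch of that method:

import torch

class CUDATimer:
    ...  # __init__ and record as in the deleted file above

    def log(self):
        # elapsed_time is only meaningful once the recorded GPU work is done
        torch.cuda.synchronize()
        ret = []
        for name, events in self.events.items():
            total = sum(start.elapsed_time(end) for start, end in events)
            count = len(events)  # recordings under this name
            ret.append(f"{name} {total:.2f}ms/{count}times")
        return ", ".join(ret)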
@@ -1,5 +1,4 @@
 torch
 triton
 transformers
-cmake
-ninja
+flash-attn
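
Presumably cmake and ninja were listed only as build tools for compiling flash-attn from source; declaring flash-attn itself makes the actual dependency explicit. Note that flash-attn's own README recommends installing it with torch already present, via pip install flash-attn --no-build-isolation.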