diff --git a/bench.py b/bench.py
index 5789754..0c4825e 100644
--- a/bench.py
+++ b/bench.py
@@ -15,6 +15,6 @@
 prompt_token_ids = torch.randint(0, 10240, (batch_size, seq_len)).tolist()
 sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_tokens)
 t = time.time()
-completions = llm.generate(prompt_token_ids, sampling_params)
+llm.generate(prompt_token_ids, sampling_params)
 throughput = batch_size * max_tokens / (time.time() - t)
 print(f"Throughput: {throughput: .2f}")
diff --git a/nanovllm/engine/llm_engine.py b/nanovllm/engine/llm_engine.py
index 0194e6a..8464885 100644
--- a/nanovllm/engine/llm_engine.py
+++ b/nanovllm/engine/llm_engine.py
@@ -52,7 +52,7 @@ class LLMEngine:
             desc="Generating",
             dynamic_ncols=True,
         )
-        if not isinstance(SamplingParams, list):
+        if not isinstance(sampling_params, list):
            sampling_params = [sampling_params] * len(prompts)
         for prompt, sp in zip(prompts, sampling_params):
             self.add_request(prompt, sp)
diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py
index b803278..c95d120 100644
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -170,7 +170,7 @@ class ModelRunner:
         context_lens = torch.zeros(max_bs, dtype=torch.int32)
         block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32)
         outputs = torch.zeros(max_bs, hf_config.hidden_size)
-        self.graph_bs = [1, 2, 4, 8, 16] + list(range(16, max_bs + 1, 16))
+        self.graph_bs = [1, 2, 4, 8] + list(range(16, max_bs + 1, 16))
         self.graphs = {}
         self.graph_pool = None
 
diff --git a/nanovllm/utils/timer.py b/nanovllm/utils/timer.py
deleted file mode 100644
index 0e8b1bc..0000000
--- a/nanovllm/utils/timer.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from contextlib import contextmanager
-from collections import defaultdict
-import torch
-
-
-class CUDATimer:
-
-    def __init__(self):
-        self.events = defaultdict(list)
-
-    @contextmanager
-    def record(self, name, enabled=True):
-        if not enabled:
-            yield
-        else:
-            start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
-            self.events[name].append((start, end))
-            start.record()
-            yield
-            end.record()
-
-    def log(self):
-        torch.cuda.synchronize()
-        ret = []
-        for name, events in self.events.items():
-            total = 0
-            count = len(self.events)
-            for start, end in events:
-                total += start.elapsed_time(end)
-            ret.append(f"{name} {total:.2f}ms/{count}times")
-        return ", ".join(ret)
diff --git a/requirements.txt b/requirements.txt
index b1f4fd9..402eb01 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 torch
 triton
 transformers
-cmake
-ninja
\ No newline at end of file
+flash-attn
\ No newline at end of file