diff --git a/README.md b/README.md index 813b88b..4059cf1 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A lightweight vLLM implementation built from scratch. ## Key Features -* 🚀 **Fase offline inference** - Comparable inference speeds to vLLM +* 🚀 **Fast offline inference** - Comparable inference speeds to vLLM * 📖 **Readable codebase** - Clean implementation under 1,200 lines of Python code * ⚡ **Optimization Suite** - Prefix caching, Torch compilation, CUDA graph, etc diff --git a/bench.py b/bench.py index 99b0cf5..ea977ba 100644 --- a/bench.py +++ b/bench.py @@ -24,4 +24,4 @@ llm.generate(prompt_token_ids, sampling_params) t = (time.time() - t) total_tokens = sum(sp.max_tokens for sp in sampling_params) throughput = total_tokens / t -print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput: .2f}tok/s") +print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s") diff --git a/nanovllm/engine/block_manager.py b/nanovllm/engine/block_manager.py index fef6645..f916fc4 100644 --- a/nanovllm/engine/block_manager.py +++ b/nanovllm/engine/block_manager.py @@ -86,7 +86,7 @@ class BlockManager: seq.block_table.append(block_id) def deallocate(self, seq: Sequence): - for block_id in seq.block_table: + for block_id in reversed(seq.block_table): block = self.blocks[block_id] block.ref_count -= 1 if block.ref_count == 0: diff --git a/pyproject.toml b/pyproject.toml index 696471e..3a6e1b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ license = "MIT" license-files = ["LICENSE"] readme = "README.md" description = "a mimic VLLM implementation from scratch" -requires-python = ">=3.9,<3.13" +requires-python = ">=3.10,<3.13" dependencies = [ "torch>=2.4.0", "triton>=3.0.0",