better
@@ -4,7 +4,7 @@ A lightweight vLLM implementation built from scratch.
 
 ## Key Features
 
-* 🚀 **Fase offline inference** - Comparable inference speeds to vLLM
+* 🚀 **Fast offline inference** - Comparable inference speeds to vLLM
 * 📖 **Readable codebase** - Clean implementation under 1,200 lines of Python code
 * ⚡ **Optimization Suite** - Prefix caching, Torch compilation, CUDA graph, etc
 
bench.py (2 changed lines)
@@ -24,4 +24,4 @@ llm.generate(prompt_token_ids, sampling_params)
 t = (time.time() - t)
 total_tokens = sum(sp.max_tokens for sp in sampling_params)
 throughput = total_tokens / t
-print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput: .2f}tok/s")
+print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
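A one-character fix with a visible effect: in a Python format spec, the leading space in `{throughput: .2f}` is the sign option, which pads non-negative numbers with a blank, so the old line printed a double space before the throughput value. A quick standalone illustration (not from the repo):

```python
throughput = 1234.5
print(f"Throughput: {throughput: .2f}tok/s")  # 'Throughput:  1234.50tok/s' (space sign option pads positives)
print(f"Throughput: {throughput:.2f}tok/s")   # 'Throughput: 1234.50tok/s'
```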
@@ -86,7 +86,7 @@ class BlockManager:
             seq.block_table.append(block_id)
 
     def deallocate(self, seq: Sequence):
-        for block_id in seq.block_table:
+        for block_id in reversed(seq.block_table):
             block = self.blocks[block_id]
             block.ref_count -= 1
             if block.ref_count == 0:
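Why reverse the iteration order? A plausible reading (an assumption, not stated in the commit): with prefix caching, free block IDs are typically recycled FIFO, so freeing a sequence's blocks back-to-front returns the prefix blocks last and keeps their cached contents reusable longer. A minimal sketch of that idea, with a hypothetical `TinyBlockPool` standing in for the real `BlockManager`:

```python
from collections import deque

class Block:
    def __init__(self, block_id: int):
        self.block_id = block_id
        self.ref_count = 0

class TinyBlockPool:
    """Hypothetical ref-counted pool with a FIFO free list (illustration only)."""

    def __init__(self, num_blocks: int):
        self.blocks = [Block(i) for i in range(num_blocks)]
        self.free_block_ids = deque(range(num_blocks))  # allocation reuses the front first

    def allocate(self, num_blocks: int) -> list[int]:
        block_table = []
        for _ in range(num_blocks):
            block_id = self.free_block_ids.popleft()
            self.blocks[block_id].ref_count += 1
            block_table.append(block_id)
        return block_table

    def deallocate(self, block_table: list[int]) -> None:
        # Freeing back-to-front pushes the prefix blocks to the rear of the
        # free queue, so they are recycled (and overwritten) last.
        for block_id in reversed(block_table):
            block = self.blocks[block_id]
            block.ref_count -= 1
            if block.ref_count == 0:
                self.free_block_ids.append(block_id)

pool = TinyBlockPool(8)
table = pool.allocate(3)   # [0, 1, 2]; block 0 holds the shared prefix
pool.deallocate(table)     # frees 2, 1, 0 -> block 0 sits deepest in the free queue
```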
@@ -10,7 +10,7 @@ license = "MIT"
 license-files = ["LICENSE"]
 readme = "README.md"
 description = "a mimic VLLM implementation from scratch"
-requires-python = ">=3.9,<3.13"
+requires-python = ">=3.10,<3.13"
 dependencies = [
     "torch>=2.4.0",
     "triton>=3.0.0",