GeeeekExplorer
2025-06-13 00:41:33 +08:00
parent 98a1551a7d
commit 135d1b38a2
5 changed files with 65 additions and 8 deletions

README.md

@@ -1 +1,36 @@
# Nano-vLLM
A lightweight vLLM implementation built from scratch.
## Key Features
* 🚀 **Fast offline inference** - Comparable inference speeds to vLLM
* 📖 **Readable codebase** - Clean implementation under 1,200 lines of Python code
* **Optimization Suite** - Prefix caching, Torch compilation, CUDA graph, etc.
## Installation
```bash
pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
```
## Quick Start
See `example.py` for usage. The API mirrors vLLM's interface with minor differences in the `LLM.generate` method.
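A minimal usage sketch, adapted from the `example.py` changes in this commit (the import path from the `nanovllm` package, the model path, and the handling of `outputs` are assumptions, not verbatim repo code):

```python
from transformers import AutoTokenizer
from nanovllm import LLM, SamplingParams  # assumed import path for the `nanovllm` package

path = "Qwen/Qwen3-0.6B"  # hypothetical checkpoint; any HF-format model directory should work
tokenizer = AutoTokenizer.from_pretrained(path)
llm = LLM(path, enforce_eager=True)

sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
prompts = [
    "introduce yourself",
    "list all prime numbers within 100",
]
# Wrap each prompt in the model's chat template, as example.py does.
prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": p}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for p in prompts
]
outputs = llm.generate(prompts, sampling_params)  # return format assumed; see example.py for details
```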
## Benchmark
See `bench.py` for the benchmark script.
**Test Configuration:**
- Hardware: RTX 4070
- Model: Qwen3-0.6B
- Total Requests: 256 sequences
- Input Length: Randomly sampled between 100 and 1024 tokens
- Output Length: Randomly sampled between 100 and 1024 tokens
**Performance Results:**
| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
|----------------|-------------|----------|-----------------------|
| vLLM | 133,966 | 98.95 | 1353.86 |
| Nano-vLLM | 133,966 | 101.90 | 1314.65 |
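Throughput is total generated tokens divided by wall-clock time, the same computation `bench.py` performs. As a quick sanity check of the Nano-vLLM row (values taken from the table above; the small drift from the reported figure comes from the rounded time):

```python
# Recompute throughput for the Nano-vLLM row of the table above.
output_tokens = 133_966   # output tokens across all 256 requests
elapsed_s = 101.90        # reported wall-clock time in seconds
throughput = output_tokens / elapsed_s
print(f"{throughput:.2f} tok/s")  # ~1314.7 tok/s, in line with the table up to rounding
```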

bench.py

@@ -24,4 +24,4 @@ llm.generate(prompt_token_ids, sampling_params)
t = (time.time() - t)
total_tokens = sum(sp.max_tokens for sp in sampling_params)
throughput = total_tokens / t
- print(f"Total: {total_tokens}, Time: {t:.2f}s, Throughput: {throughput: .2f}")
+ print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput: .2f}tok/s")

example.py

@@ -9,8 +9,8 @@ llm = LLM(path, enforce_eager=True)
sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
prompts = [
-    "自我介绍一下吧!",
-    "列出100内所有素数",
+    "introduce yourself",
+    "list all prime numbers within 100",
]
prompts = [
    tokenizer.apply_chat_template(

pyproject.toml (new file)

@@ -0,0 +1,26 @@
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"
[project]
name = "nano-vllm"
version = "0.1.0"
authors = [{ name = "Xingkai Yu" }]
license = "MIT"
license-files = ["LICENSE"]
readme = "README.md"
description = "a lightweight vLLM implementation built from scratch"
requires-python = ">=3.9,<3.13"
dependencies = [
"torch>=2.4.0",
"triton>=3.0.0",
"transformers>=4.51.0",
"flash-attn",
"nvidia-ml-py",
]
[project.urls]
Homepage = "https://github.com/GeeeekExplorer/nano-vllm"
[tool.setuptools]
packages = ["nanovllm"]

requirements.txt (deleted)

@@ -1,4 +0,0 @@
torch
triton
transformers
flash-attn