release

README.md (37 changed lines)

@@ -1 +1,36 @@
-# Nano-VLLM
+# Nano-vLLM
+
+A lightweight vLLM implementation built from scratch.
+
+## Key Features
+
+* 🚀 **Fast offline inference** - Comparable inference speed to vLLM
+* 📖 **Readable codebase** - A clean implementation in under 1,200 lines of Python code
+* ⚡ **Optimization suite** - Prefix caching, Torch compilation, CUDA graphs, etc.
+
+## Installation
+
+```bash
+pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
+```
+
+## Quick Start
+
+See `example.py` for usage. The API mirrors vLLM's interface, with minor differences in the `LLM.generate` method.
+
+## Benchmark
+
+See `bench.py` for the benchmark script.
+
+**Test Configuration:**
+- Hardware: RTX 4070
+- Model: Qwen3-0.6B
+- Total Requests: 256 sequences
+- Input Length: randomly sampled between 100 and 1024 tokens
+- Output Length: randomly sampled between 100 and 1024 tokens
+
+**Performance Results:**
+
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|------------------|---------------|----------|-----------------------|
+| vLLM             | 133,966       | 98.95    | 1353.86               |
+| Nano-vLLM        | 133,966       | 101.90   | 1314.65               |
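
Since the Quick Start above only points at `example.py`, here is a minimal usage sketch assembled from the `bench.py` and `example.py` fragments later in this commit; the `nanovllm` import path is taken from the `[tool.setuptools]` packages entry in `pyproject.toml`, while the model path and the exact `LLM.generate` argument types (raw strings vs. token ids) are assumptions:

```python
from nanovllm import LLM, SamplingParams

# enforce_eager=True mirrors the example.py snippet in this commit;
# as in vLLM, it presumably skips CUDA graph capture.
llm = LLM("/path/to/Qwen3-0.6B", enforce_eager=True)  # hypothetical local model path

sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
outputs = llm.generate(["introduce yourself"], sampling_params)
print(outputs)
```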

bench.py (2 changed lines)

@@ -24,4 +24,4 @@ llm.generate(prompt_token_ids, sampling_params)
 t = (time.time() - t)
 total_tokens = sum(sp.max_tokens for sp in sampling_params)
 throughput = total_tokens / t
-print(f"Total: {total_tokens}, Time: {t:.2f}s, Throughput: {throughput: .2f}")
+print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput: .2f}tok/s")

example.py

@@ -9,8 +9,8 @@ llm = LLM(path, enforce_eager=True)
 sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
 prompts = [
-    "自我介绍一下吧!",
-    "列出100内所有素数",
+    "introduce yourself",
+    "list all prime numbers within 100",
 ]
 prompts = [
     tokenizer.apply_chat_template(
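
The hunk is cut off at `apply_chat_template`; a plausible continuation of that snippet, sketched from the standard Hugging Face `transformers` chat-template API rather than from this commit (the single-turn message format and the keyword arguments are assumptions):

```python
# Render each raw prompt through the model's chat template so the
# engine sees the same formatting the model was fine-tuned on.
prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,              # return a string, not token ids
        add_generation_prompt=True,  # append the assistant turn header
    )
    for prompt in prompts
]
```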

pyproject.toml (new file, 26 lines)

@@ -0,0 +1,26 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "nano-vllm"
+version = "0.1.0"
+authors = [{ name = "Xingkai Yu" }]
+license = "MIT"
+license-files = ["LICENSE"]
+readme = "README.md"
+description = "a lightweight vLLM implementation built from scratch"
+requires-python = ">=3.9,<3.13"
+dependencies = [
+    "torch>=2.4.0",
+    "triton>=3.0.0",
+    "transformers>=4.51.0",
+    "flash-attn",
+    "nvidia-ml-py",
+]
+
+[project.urls]
+Homepage = "https://github.com/GeeeekExplorer/nano-vllm"
+
+[tool.setuptools]
+packages = ["nanovllm"]
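
With this metadata in place, a local checkout can also be installed in editable mode via `pip install -e .` (standard setuptools behavior, not something this commit documents). Note that `flash-attn` is left unpinned and typically needs a matching CUDA toolchain or a prebuilt wheel to install.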

requirements.txt (deleted)

@@ -1,4 +0,0 @@
-torch
-triton
-transformers
-flash-attn