release

README.md (37 changed lines)

@@ -1 +1,36 @@
-# Nano-VLLM
+# Nano-vLLM
+
+A lightweight vLLM implementation built from scratch.
+
+## Key Features
+
+* 🚀 **Fast offline inference** - Comparable inference speed to vLLM
+* 📖 **Readable codebase** - A clean implementation in under 1,200 lines of Python code
+* ⚡ **Optimization suite** - Prefix caching, Torch compilation, CUDA graphs, etc.
+
+## Installation
+
+```bash
+pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
+```
+
+## Quick Start
+
+See `example.py` for usage. The API mirrors vLLM's interface, with minor differences in the `LLM.generate` method.
+
+## Benchmark
+
+See `bench.py` for the benchmark script.
+
+**Test Configuration:**
+- Hardware: RTX 4070
+- Model: Qwen3-0.6B
+- Total Requests: 256 sequences
+- Input Length: randomly sampled between 100 and 1024 tokens
+- Output Length: randomly sampled between 100 and 1024 tokens
+
+**Performance Results:**
+
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|------------------|---------------|----------|-----------------------|
+| vLLM             | 133,966       | 98.95    | 1353.86               |
+| Nano-vLLM        | 133,966       | 101.90   | 1314.65               |
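
Since the Quick Start above only points at `example.py`, here is a minimal usage sketch assembled from the `bench.py` and `example.py` fragments later in this commit; the `nanovllm` import path is taken from the `[tool.setuptools]` packages entry in `pyproject.toml`, while the model path and the exact `LLM.generate` argument types (raw strings vs. token ids) are assumptions:

```python
from nanovllm import LLM, SamplingParams

# enforce_eager=True mirrors the example.py snippet in this commit;
# as in vLLM, it presumably skips CUDA graph capture.
llm = LLM("/path/to/Qwen3-0.6B", enforce_eager=True)  # hypothetical local model path

sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
outputs = llm.generate(["introduce yourself"], sampling_params)
print(outputs)
```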

bench.py (2 changed lines)

@@ -24,4 +24,4 @@ llm.generate(prompt_token_ids, sampling_params)
 t = (time.time() - t)
 total_tokens = sum(sp.max_tokens for sp in sampling_params)
 throughput = total_tokens / t
-print(f"Total: {total_tokens}, Time: {t:.2f}s, Throughput: {throughput: .2f}")
+print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput: .2f}tok/s")

example.py

@@ -9,8 +9,8 @@ llm = LLM(path, enforce_eager=True)
 sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
 prompts = [
-    "自我介绍一下吧!",
-    "列出100内所有素数",
+    "introduce yourself",
+    "list all prime numbers within 100",
 ]
 prompts = [
     tokenizer.apply_chat_template(
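
The hunk is cut off at `apply_chat_template`; a plausible continuation of that snippet, sketched from the standard Hugging Face `transformers` chat-template API rather than from this commit (the single-turn message format and the keyword arguments are assumptions):

```python
# Render each raw prompt through the model's chat template so the
# engine sees the same formatting the model was fine-tuned on.
prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,              # return a string, not token ids
        add_generation_prompt=True,  # append the assistant turn header
    )
    for prompt in prompts
]
```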

pyproject.toml (new file, 26 lines)

@@ -0,0 +1,26 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "nano-vllm"
+version = "0.1.0"
+authors = [{ name = "Xingkai Yu" }]
+license = "MIT"
+license-files = ["LICENSE"]
+readme = "README.md"
+description = "a lightweight vLLM implementation built from scratch"
+requires-python = ">=3.9,<3.13"
+dependencies = [
+    "torch>=2.4.0",
+    "triton>=3.0.0",
+    "transformers>=4.51.0",
+    "flash-attn",
+    "nvidia-ml-py",
+]
+
+[project.urls]
+Homepage = "https://github.com/GeeeekExplorer/nano-vllm"
+
+[tool.setuptools]
+packages = ["nanovllm"]
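
With this metadata in place, a local checkout can also be installed in editable mode via `pip install -e .` (standard setuptools behavior, not something this commit documents). Note that `flash-attn` is left unpinned and typically needs a matching CUDA toolchain or a prebuilt wheel to install.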

requirements.txt (deleted)

@@ -1,4 +0,0 @@
-torch
-triton
-transformers
-flash-attn