GeeeekExplorer
2025-06-13 00:41:33 +08:00
parent 98a1551a7d
commit 135d1b38a2
5 changed files with 65 additions and 8 deletions

README.md

@@ -1 +1,36 @@
# Nano-vLLM
A lightweight vLLM implementation built from scratch.
## Key Features
* 🚀 **Fast offline inference** - Comparable inference speeds to vLLM
* 📖 **Readable codebase** - Clean implementation under 1,200 lines of Python code
* **Optimization Suite** - Prefix caching, Torch compilation, CUDA graph, etc.
## Installation
```bash
pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
```
## Quick Start
See `example.py` for usage. The API mirrors vLLM's interface with minor differences in the `LLM.generate` method.
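A minimal usage sketch, adapted from the `example.py` changes in this commit (the import path from the `nanovllm` package, the model path, and the handling of `outputs` are assumptions, not verbatim repo code):

```python
from transformers import AutoTokenizer
from nanovllm import LLM, SamplingParams  # assumed import path for the `nanovllm` package

path = "Qwen/Qwen3-0.6B"  # hypothetical checkpoint; any HF-format model directory should work
tokenizer = AutoTokenizer.from_pretrained(path)
llm = LLM(path, enforce_eager=True)

sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
prompts = [
    "introduce yourself",
    "list all prime numbers within 100",
]
# Wrap each prompt in the model's chat template, as example.py does.
prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": p}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for p in prompts
]
outputs = llm.generate(prompts, sampling_params)  # return format assumed; see example.py for details
```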
## Benchmark
See `bench.py` for the benchmark script.
**Test Configuration:**
- Hardware: RTX 4070
- Model: Qwen3-0.6B
- Total Requests: 256 sequences
- Input Length: Randomly sampled between 100 and 1024 tokens
- Output Length: Randomly sampled between 100 and 1024 tokens
**Performance Results:**
| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
|----------------|-------------|----------|-----------------------|
| vLLM | 133,966 | 98.95 | 1353.86 |
| Nano-vLLM | 133,966 | 101.90 | 1314.65 |
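Throughput is total generated tokens divided by wall-clock time, the same computation `bench.py` performs. As a quick sanity check of the Nano-vLLM row (values taken from the table above; the small drift from the reported figure comes from the rounded time):

```python
# Recompute throughput for the Nano-vLLM row of the table above.
output_tokens = 133_966   # output tokens across all 256 requests
elapsed_s = 101.90        # reported wall-clock time in seconds
throughput = output_tokens / elapsed_s
print(f"{throughput:.2f} tok/s")  # ~1314.7 tok/s, in line with the table up to rounding
```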

bench.py

@@ -24,4 +24,4 @@ llm.generate(prompt_token_ids, sampling_params)
t = (time.time() - t)
total_tokens = sum(sp.max_tokens for sp in sampling_params)
throughput = total_tokens / t
- print(f"Total: {total_tokens}, Time: {t:.2f}s, Throughput: {throughput: .2f}")
+ print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput: .2f}tok/s")

example.py

@@ -9,8 +9,8 @@ llm = LLM(path, enforce_eager=True)
sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
prompts = [
-    "自我介绍一下吧!",
-    "列出100内所有素数",
+    "introduce yourself",
+    "list all prime numbers within 100",
]
prompts = [
    tokenizer.apply_chat_template(

pyproject.toml (new file)

@@ -0,0 +1,26 @@
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"
[project]
name = "nano-vllm"
version = "0.1.0"
authors = [{ name = "Xingkai Yu" }]
license = "MIT"
license-files = ["LICENSE"]
readme = "README.md"
description = "a lightweight vLLM implementation built from scratch"
requires-python = ">=3.9,<3.13"
dependencies = [
"torch>=2.4.0",
"triton>=3.0.0",
"transformers>=4.51.0",
"flash-attn",
"nvidia-ml-py",
]
[project.urls]
Homepage = "https://github.com/GeeeekExplorer/nano-vllm"
[tool.setuptools]
packages = ["nanovllm"]

requirements.txt (deleted)

@@ -1,4 +0,0 @@
torch
triton
transformers
flash-attn