update bench

2025-06-19 23:24:43 +08:00
parent fa0078174e
commit 801365a611
2 changed files with 4 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ outputs[0]["text"]
 See `bench.py` for benchmark.
 **Test Configuration:**
-- Hardware: RTX 4070
+- Hardware: RTX 4070 Laptop (8GB)
 - Model: Qwen3-0.6B
 - Total Requests: 256 sequences
 - Input Length: Randomly sampled between 100–1024 tokens
@@ -40,5 +40,5 @@ See `bench.py` for benchmark.
 **Performance Results:**
 | Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
 |----------------|-------------|----------|-----------------------|
-| vLLM           | 133,966     | 98.95    | 1353.86               |
+| vLLM           | 133,966     | 98.37    | 1361.84               |
-| Nano-vLLM      | 133,966     | 101.90   | 1314.65               |
+| Nano-vLLM      | 133,966     | 93.41    | 1434.13               |
--- a/bench.py
+++ b/bench.py
@@ -21,7 +21,7 @@ def main():
    llm.generate(["Benchmark: "], SamplingParams())
    t = time.time()
-    llm.generate(prompt_token_ids, sampling_params)
+    llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
    t = (time.time() - t)
    total_tokens = sum(sp.max_tokens for sp in sampling_params)
    throughput = total_tokens / t