diff --git a/README.md b/README.md
index 7f93114..90016af 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ outputs[0]["text"]
 See `bench.py` for benchmark.
 
 **Test Configuration:**
-- Hardware: RTX 4070
+- Hardware: RTX 4070 Laptop (8GB)
 - Model: Qwen3-0.6B
 - Total Requests: 256 sequences
 - Input Length: Randomly sampled between 100–1024 tokens
@@ -40,5 +40,5 @@ See `bench.py` for benchmark.
 **Performance Results:**
 | Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
 |----------------|-------------|----------|-----------------------|
-| vLLM | 133,966 | 98.95 | 1353.86 |
-| Nano-vLLM | 133,966 | 101.90 | 1314.65 |
+| vLLM | 133,966 | 98.37 | 1361.84 |
+| Nano-vLLM | 133,966 | 93.41 | 1434.13 |
diff --git a/bench.py b/bench.py
index a7c85f7..8e61d65 100644
--- a/bench.py
+++ b/bench.py
@@ -21,7 +21,7 @@ def main():
     llm.generate(["Benchmark: "], SamplingParams())
 
     t = time.time()
-    llm.generate(prompt_token_ids, sampling_params)
+    llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = (time.time() - t)
     total_tokens = sum(sp.max_tokens for sp in sampling_params)
     throughput = total_tokens / t
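
For reference, a minimal sketch of the timing path this patch touches: tqdm is disabled during the measured generate() call so progress-bar rendering does not distort the wall-clock time, and throughput is then derived as total output tokens divided by elapsed seconds. Everything outside the hunks above (import path, model path, prompt and sampling setup) is an illustrative assumption, not taken from this diff.

import time
from random import randint, seed

from nanovllm import LLM, SamplingParams  # assumed import path

seed(0)
llm = LLM("Qwen3-0.6B")  # model path is illustrative

# 256 requests with input/output lengths sampled in [100, 1024], per the README config above.
prompt_token_ids = [
    [randint(0, 10000) for _ in range(randint(100, 1024))] for _ in range(256)
]
sampling_params = [
    # ignore_eos is an assumption: it makes each request emit exactly max_tokens,
    # so summing max_tokens matches the actual generated token count.
    SamplingParams(ignore_eos=True, max_tokens=randint(100, 1024)) for _ in range(256)
]

llm.generate(["Benchmark: "], SamplingParams())  # warm-up call, excluded from timing

t = time.time()
# use_tqdm=False keeps progress-bar output out of the measured interval
llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
t = time.time() - t

total_tokens = sum(sp.max_tokens for sp in sampling_params)
print(f"{total_tokens} tokens in {t:.2f}s -> {total_tokens / t:.2f} tok/s")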