[fix] fix bench*.py.

Zijie Tian
2025-12-22 19:53:50 +08:00
parent 051f2295c9
commit 08d83185ce
5 changed files with 160 additions and 32 deletions

.claude/cclsp.json Normal file

@@ -0,0 +1,59 @@
{
"servers": [
{
"extensions": [
"py",
"pyi"
],
"command": [
"uvx",
"--from",
"python-lsp-server",
"pylsp"
],
"rootDir": ".",
"restartInterval": 5,
"initializationOptions": {
"settings": {
"pylsp": {
"plugins": {
"jedi_completion": {
"enabled": true
},
"jedi_definition": {
"enabled": true
},
"jedi_hover": {
"enabled": true
},
"jedi_references": {
"enabled": true
},
"jedi_signature_help": {
"enabled": true
},
"jedi_symbols": {
"enabled": true
},
"pylint": {
"enabled": false
},
"pycodestyle": {
"enabled": false
},
"pyflakes": {
"enabled": false
},
"yapf": {
"enabled": false
},
"rope_completion": {
"enabled": false
}
}
}
}
}
}
]
}


@@ -235,3 +235,62 @@ Warmup uses a reasonable sequence length (`block_size * 2`) instead of `max_model_len`
| `gpu_memory_utilization` | 0.9 | GPU memory fraction for KV cache |
| `enforce_eager` | False | Disable CUDA graphs if True |
| `num_prefetch_blocks` | 2 | Ring buffer pipeline depth (deprecated, uses num_gpu_blocks) |
## Benchmarking

### Benchmark Files

| File | Purpose | Key Parameters |
|------|---------|----------------|
| `bench.py` | Standard GPU benchmark | Pure GPU inference |
| `bench_offload.py` | CPU offload benchmark | `enable_cpu_offload=True`, `num_gpu_blocks=8` |
| `bench_vllm.py` | vLLM comparison | Uses vLLM API for baseline comparison |

### Current Test Configuration

All benchmark files are aligned to use the following settings (a configuration sketch follows the list):

- **Model**: `~/models/Qwen3-0.6B/`
- **max_model_len**: 40960 (limited by the model's `max_position_embeddings`)
- **Prefill test**: input_len = max_len - 1 (40959 tokens)
- **Decode test**: input_len = max_len - 128, output_len = 128

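A minimal sketch of that shared geometry (the variable names `prefill_input_len` and `decode_input_len` are illustrative; the values and the deterministic random prompts mirror the benchmark scripts in this commit):

```python
from random import randint, seed

# Shared test geometry used by bench.py, bench_offload.py and bench_vllm.py.
max_len = 40960                                 # Qwen3-0.6B max_position_embeddings
prefill_input_len = max_len - 1                 # 40959-token single-sequence prefill
decode_output_len = 128
decode_input_len = max_len - decode_output_len  # keeps input + output <= max_model_len

# Prompts are random token ids, generated deterministically as in the benchmarks.
seed(0)
prompt_token_ids = [[randint(0, 10000) for _ in range(decode_input_len)] for _ in range(1)]
```
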
### Common Issues and Solutions

**1. `max_num_batched_tokens` assertion error**

```
AssertionError: assert self.max_num_batched_tokens >= self.max_model_len
```

**Solution**: Set `max_num_batched_tokens=max_model_len` when using large context lengths.

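For example, the nano-vllm benchmarks in this commit construct the engine with the same value for both limits; a minimal sketch using the same keyword arguments as `bench.py`:

```python
import os
from nanovllm import LLM

path = os.path.expanduser("~/models/Qwen3-0.6B/")
max_len = 40960

# Passing max_num_batched_tokens explicitly keeps the scheduler's per-step token
# budget at least as large as the context window, which satisfies the assertion above.
llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_batched_tokens=max_len)
```
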
**2. CUDA graph block_tables dimension mismatch**

```
RuntimeError: The expanded size of the tensor (1) must match the existing size (2)
```

**Cause**: `input_len + output_len > max_model_len` requires more KV-cache blocks than were pre-allocated for the CUDA graph.

**Solution**: Ensure `input_len + output_len <= max_model_len`.

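A quick way to stay inside the limit is to derive the input length from the desired output length, as the updated decode benchmarks do; a plain-arithmetic sketch:

```python
max_model_len = 40960
output_len = 128
input_len = max_model_len - output_len  # 40832

# The decode test only uses (input_len, output_len) pairs that satisfy this,
# so the CUDA graph's pre-allocated block_tables are never exceeded.
assert input_len + output_len <= max_model_len
```
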
**3. RoPE position embedding out of bounds**

```
Assertion `index out of bounds: 0 <= ... < 40960` failed
```

**Cause**: The sequence length exceeds the model's `max_position_embeddings`.

**Solution**: Check the model's `config.json` for `max_position_embeddings` and limit `max_model_len` accordingly.

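A hedged sketch for looking the limit up programmatically, assuming the standard Hugging Face `config.json` layout used by the models listed below:

```python
import json
import os

model_dir = os.path.expanduser("~/models/Qwen3-0.6B/")
with open(os.path.join(model_dir, "config.json")) as f:
    config = json.load(f)

max_pos = config["max_position_embeddings"]  # 40960 for Qwen3-0.6B
max_model_len = min(40960, max_pos)          # never ask for more than the model supports
```
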
### Model Context Length Limits

| Model | max_position_embeddings | Notes |
|-------|------------------------|-------|
| Qwen3-0.6B | 40960 | ~40K context |
| Qwen3-4B | 40960 | ~40K context |
| Qwen2.5-7B-Instruct-1M | 1048576 | 1M context |

**Important**: Always check `max_position_embeddings` in `config.json` before setting `max_model_len`.

### Performance Reference (Qwen3-0.6B, 40K context)

| Mode | Prefill (tok/s) | Decode (tok/s) |
|------|-----------------|----------------|
| GPU (bench.py) | ~18,000 | ~100 |
| CPU Offload (bench_offload.py) | ~7,200 | ~3.5 |

CPU offload trades performance for memory efficiency, enabling long-context inference on limited GPU memory.
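For context, all three scripts compute throughput the same way: total generated output tokens divided by the wall-clock time of `generate()`. A small helper sketch (the function name is ours, not part of the benchmarks):

```python
import time

def measure_decode_throughput(llm, prompt_token_ids, sampling_params, num_seqs, output_len):
    """Wall-clock a generate() call and return tokens/s (mirrors bench_decode)."""
    t = time.time()
    llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
    elapsed = time.time() - t
    total_output_tokens = num_seqs * output_len
    return total_output_tokens / elapsed
```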

bench.py

@@ -4,18 +4,18 @@ from random import randint, seed
 from nanovllm import LLM, SamplingParams


-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")


 def bench_prefill(llm, num_seqs, input_len):
@@ -34,8 +34,10 @@ def bench_prefill(llm, num_seqs, input_len):
 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_batched_tokens=max_len)

     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())
@@ -45,15 +47,15 @@ def main():
     print("=" * 60)
     # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)

     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    # bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    bench_decode(llm, num_seqs=1, max_input_len=4072, max_output_len=16)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len

 if __name__ == "__main__":

bench_offload.py

@@ -9,18 +9,18 @@ from nanovllm.kvcache.sparse.hybrid import HybridPolicy
 from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy


-def bench_decode(llm, num_seqs, input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
     prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_output_len) for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")


 def bench_prefill(llm, num_seqs, input_len):
@@ -95,18 +95,20 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--no-sparse", action="store_true", help="Disable sparse attention (baseline)")
     parser.add_argument("--topk", type=int, default=8, help="Top-K blocks for Quest")
-    parser.add_argument("--input-len", type=int, default=128 * 1024, help="Input length in tokens")
+    parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens (default: max_len - 1 for prefill, max_len - output_len for decode)")
    parser.add_argument("--output-len", type=int, default=128, help="Output length in tokens")
     args = parser.parse_args()

-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
     llm = LLM(
         path,
         enforce_eager=False,
-        max_model_len=256 * 1024,
-        max_num_batched_tokens=256 * 1024,
+        max_model_len=max_len,
+        max_num_batched_tokens=max_len,
         enable_cpu_offload=True,
-        num_gpu_blocks=120,
+        num_gpu_blocks=8,  # Small GPU buffer for offload testing
         num_prefetch_blocks=4,
     )
@@ -120,15 +122,19 @@ def main():
     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())

+    # Default input lengths based on max_len
+    prefill_input_len = args.input_len if args.input_len else max_len - 1
+    decode_input_len = args.input_len if args.input_len else max_len - args.output_len

     print("=" * 60)
     print("Prefill Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=args.input_len)
+    bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)

     print("=" * 60)
     print("Decode Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, input_len=args.input_len, max_output_len=args.output_len)
+    bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)

 if __name__ == "__main__":

bench_vllm.py

@@ -5,19 +5,19 @@ from random import randint, seed
 from vllm import LLM, SamplingParams


-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")


 def bench_prefill(llm, num_seqs, input_len):
@@ -37,8 +37,10 @@ def bench_prefill(llm, num_seqs, input_len):
 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096, max_num_seqs=128, gpu_memory_utilization=0.9)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_seqs=128, gpu_memory_utilization=0.9)

     # Warmup
     llm.generate([dict(prompt_token_ids=[0])], SamplingParams())
@@ -46,17 +48,17 @@ def main():
     print("=" * 60)
     print("Prefill Benchmark")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=1024)
+    # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    # bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)

     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    # bench_decode(llm, num_seqs=256, max_input_len=1024, max_output_len=1024)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len

 if __name__ == "__main__":