[fix] fix bench*.py.

.claude/cclsp.json (new file, +59)
@@ -0,0 +1,59 @@
+{
+  "servers": [
+    {
+      "extensions": [
+        "py",
+        "pyi"
+      ],
+      "command": [
+        "uvx",
+        "--from",
+        "python-lsp-server",
+        "pylsp"
+      ],
+      "rootDir": ".",
+      "restartInterval": 5,
+      "initializationOptions": {
+        "settings": {
+          "pylsp": {
+            "plugins": {
+              "jedi_completion": {
+                "enabled": true
+              },
+              "jedi_definition": {
+                "enabled": true
+              },
+              "jedi_hover": {
+                "enabled": true
+              },
+              "jedi_references": {
+                "enabled": true
+              },
+              "jedi_signature_help": {
+                "enabled": true
+              },
+              "jedi_symbols": {
+                "enabled": true
+              },
+              "pylint": {
+                "enabled": false
+              },
+              "pycodestyle": {
+                "enabled": false
+              },
+              "pyflakes": {
+                "enabled": false
+              },
+              "yapf": {
+                "enabled": false
+              },
+              "rope_completion": {
+                "enabled": false
+              }
+            }
+          }
+        }
+      }
+    }
+  ]
+}

CLAUDE.md (+59)
@@ -235,3 +235,62 @@ Warmup uses a reasonable sequence length (`block_size * 2`) instead of `max_model_len`
 | `gpu_memory_utilization` | 0.9 | GPU memory fraction for KV cache |
 | `enforce_eager` | False | Disable CUDA graphs if True |
 | `num_prefetch_blocks` | 2 | Ring buffer pipeline depth (deprecated, uses num_gpu_blocks) |
+
+## Benchmarking
+
+### Benchmark Files
+
+| File | Purpose | Key Parameters |
+|------|---------|----------------|
+| `bench.py` | Standard GPU benchmark | Pure GPU inference |
+| `bench_offload.py` | CPU offload benchmark | `enable_cpu_offload=True`, `num_gpu_blocks=8` |
+| `bench_vllm.py` | vLLM comparison | Uses vLLM API for baseline comparison |
+
+### Current Test Configuration
+
+All benchmark files are aligned to use:
+- **Model**: `~/models/Qwen3-0.6B/`
+- **max_model_len**: 40960 (limited by the model's `max_position_embeddings`)
+- **Prefill test**: input_len = max_len - 1 (40959 tokens)
+- **Decode test**: input_len = max_len - 128, output_len = 128
+
+### Common Issues and Solutions
+
+**1. `max_num_batched_tokens` assertion error**
+```
+AssertionError: assert self.max_num_batched_tokens >= self.max_model_len
+```
+**Solution**: Set `max_num_batched_tokens=max_model_len` when using large context lengths.
+
+**2. CUDA graph block_tables dimension mismatch**
+```
+RuntimeError: The expanded size of the tensor (1) must match the existing size (2)
+```
+**Cause**: `input_len + output_len > max_model_len` requires more blocks than were pre-allocated for the CUDA graph.
+**Solution**: Ensure `input_len + output_len <= max_model_len`.
+
+**3. RoPE position embedding out of bounds**
+```
+Assertion `index out of bounds: 0 <= ... < 40960` failed
+```
+**Cause**: Sequence length exceeds the model's `max_position_embeddings`.
+**Solution**: Check the model's `config.json` for `max_position_embeddings` and limit `max_model_len` accordingly.
+
+### Model Context Length Limits
+
+| Model | max_position_embeddings | Notes |
+|-------|------------------------|-------|
+| Qwen3-0.6B | 40960 | ~40K context |
+| Qwen3-4B | 40960 | ~40K context |
+| Qwen2.5-7B-Instruct-1M | 1048576 | 1M context |
+
+**Important**: Always check `max_position_embeddings` in `config.json` before setting `max_model_len`.
+
+### Performance Reference (Qwen3-0.6B, 40K context)
+
+| Mode | Prefill (tok/s) | Decode (tok/s) |
+|------|-----------------|----------------|
+| GPU (bench.py) | ~18,000 | ~100 |
+| CPU Offload (bench_offload.py) | ~7,200 | ~3.5 |
+
+CPU offload trades performance for memory efficiency, enabling long-context inference on limited GPU memory.
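
Taken together, the three issues documented above reduce to a few configuration rules. Below is a minimal sketch of applying them, assuming the nanovllm `LLM` constructor arguments that appear in `bench.py` further down; reading `max_position_embeddings` out of the model's `config.json` is illustrative glue, not something the benchmark scripts themselves do:

```python
import json
import os

from nanovllm import LLM

path = os.path.expanduser("~/models/Qwen3-0.6B/")

# Issue 3: never exceed the model's max_position_embeddings (40960 for Qwen3-0.6B).
with open(os.path.join(path, "config.json")) as f:
    max_len = json.load(f)["max_position_embeddings"]

llm = LLM(
    path,
    enforce_eager=False,
    max_model_len=max_len,
    max_num_batched_tokens=max_len,  # Issue 1: must be >= max_model_len
)

# Issue 2: keep input_len + output_len <= max_model_len.
input_len, output_len = max_len - 128, 128
```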

bench.py (22 changed lines)
@@ -4,18 +4,18 @@ from random import randint, seed
 from nanovllm import LLM, SamplingParams


-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)

     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")


 def bench_prefill(llm, num_seqs, input_len):
@@ -34,8 +34,10 @@ def bench_prefill(llm, num_seqs, input_len):


 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_batched_tokens=max_len)

     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())
@@ -45,15 +47,15 @@ def main():
     print("=" * 60)
     # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)

     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    # bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    bench_decode(llm, num_seqs=1, max_input_len=4072, max_output_len=16)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len


 if __name__ == "__main__":
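
All three benchmark scripts replace `sum(sp.max_tokens for sp in sampling_params)` with `num_seqs * output_len`; since the prompts now have a fixed `input_len` and sampling uses `ignore_eos=True` with `max_tokens=output_len`, every sequence decodes exactly `output_len` tokens and the two expressions agree. A minimal sketch of the shared timing and throughput bookkeeping (the helper name is illustrative, not part of the repo):

```python
import time


def measure_decode_throughput(llm, prompt_token_ids, sampling_params, num_seqs, output_len):
    """Time one generate() call and return decode throughput in tokens per second."""
    start = time.time()
    llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
    elapsed = time.time() - start
    # Exact count: ignore_eos=True with max_tokens=output_len forces output_len tokens per sequence.
    total_output_tokens = num_seqs * output_len
    return total_output_tokens / elapsed
```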

bench_offload.py
@@ -9,18 +9,18 @@ from nanovllm.kvcache.sparse.hybrid import HybridPolicy
 from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy


-def bench_decode(llm, num_seqs, input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
     prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_output_len) for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)

     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")


 def bench_prefill(llm, num_seqs, input_len):
@@ -95,18 +95,20 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--no-sparse", action="store_true", help="Disable sparse attention (baseline)")
     parser.add_argument("--topk", type=int, default=8, help="Top-K blocks for Quest")
-    parser.add_argument("--input-len", type=int, default=128 * 1024, help="Input length in tokens")
+    parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens (default: max_len - 1 for prefill, max_len - output_len for decode)")
     parser.add_argument("--output-len", type=int, default=128, help="Output length in tokens")
     args = parser.parse_args()

-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
     llm = LLM(
         path,
         enforce_eager=False,
-        max_model_len=256 * 1024,
-        max_num_batched_tokens=256 * 1024,
+        max_model_len=max_len,
+        max_num_batched_tokens=max_len,
         enable_cpu_offload=True,
-        num_gpu_blocks=120,
+        num_gpu_blocks=8,  # Small GPU buffer for offload testing
         num_prefetch_blocks=4,
     )

@@ -120,15 +122,19 @@ def main():
     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())

+    # Default input lengths based on max_len
+    prefill_input_len = args.input_len if args.input_len else max_len - 1
+    decode_input_len = args.input_len if args.input_len else max_len - args.output_len
+
     print("=" * 60)
     print("Prefill Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=args.input_len)
+    bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)

     print("=" * 60)
     print("Decode Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, input_len=args.input_len, max_output_len=args.output_len)
+    bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)


 if __name__ == "__main__":
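
For reference, the CPU-offload configuration that `bench_offload.py` now builds, collected into one hedged sketch; the keyword arguments are the ones visible in the diff above, and the comments are glosses based on the CLAUDE.md parameter table (which marks `num_prefetch_blocks` as deprecated):

```python
import os

from nanovllm import LLM

path = os.path.expanduser("~/models/Qwen3-0.6B/")
max_len = 40960  # Qwen3-0.6B max_position_embeddings

llm = LLM(
    path,
    enforce_eager=False,
    max_model_len=max_len,
    max_num_batched_tokens=max_len,  # keep >= max_model_len for full-context prefill
    enable_cpu_offload=True,         # trade speed for memory: KV cache mostly in host RAM
    num_gpu_blocks=8,                # small GPU ring buffer for offload testing
    num_prefetch_blocks=4,           # pipeline depth (deprecated per the CLAUDE.md table)
)
```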

bench_vllm.py
@@ -5,19 +5,19 @@ from random import randint, seed
 from vllm import LLM, SamplingParams


-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]

     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")


 def bench_prefill(llm, num_seqs, input_len):
@@ -37,8 +37,10 @@ def bench_prefill(llm, num_seqs, input_len):


 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096, max_num_seqs=128, gpu_memory_utilization=0.9)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_seqs=128, gpu_memory_utilization=0.9)

     # Warmup
     llm.generate([dict(prompt_token_ids=[0])], SamplingParams())
@@ -46,17 +48,17 @@ def main():
     print("=" * 60)
     print("Prefill Benchmark")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=1024)
+    # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    # bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)

     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    # bench_decode(llm, num_seqs=256, max_input_len=1024, max_output_len=1024)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len


 if __name__ == "__main__":