[fix] fix bench*.py.

.claude/cclsp.json | 59 (new file)
@@ -0,0 +1,59 @@
+{
+  "servers": [
+    {
+      "extensions": [
+        "py",
+        "pyi"
+      ],
+      "command": [
+        "uvx",
+        "--from",
+        "python-lsp-server",
+        "pylsp"
+      ],
+      "rootDir": ".",
+      "restartInterval": 5,
+      "initializationOptions": {
+        "settings": {
+          "pylsp": {
+            "plugins": {
+              "jedi_completion": {
+                "enabled": true
+              },
+              "jedi_definition": {
+                "enabled": true
+              },
+              "jedi_hover": {
+                "enabled": true
+              },
+              "jedi_references": {
+                "enabled": true
+              },
+              "jedi_signature_help": {
+                "enabled": true
+              },
+              "jedi_symbols": {
+                "enabled": true
+              },
+              "pylint": {
+                "enabled": false
+              },
+              "pycodestyle": {
+                "enabled": false
+              },
+              "pyflakes": {
+                "enabled": false
+              },
+              "yapf": {
+                "enabled": false
+              },
+              "rope_completion": {
+                "enabled": false
+              }
+            }
+          }
+        }
+      }
+    }
+  ]
+}
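The new `.claude/cclsp.json` registers `pylsp` (launched through `uvx`) for `.py`/`.pyi` files, with only the Jedi plugins enabled and the linters/formatters turned off. A minimal smoke test, not part of this commit and assuming `uvx` is on PATH and can fetch `python-lsp-server`, to confirm the configured command starts and answers an LSP `initialize` request over stdio:

```python
import json
import subprocess

# Launch the exact command configured in cclsp.json.
proc = subprocess.Popen(
    ["uvx", "--from", "python-lsp-server", "pylsp"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
)

# LSP messages are framed as "Content-Length: N\r\n\r\n" + JSON body.
body = json.dumps({
    "jsonrpc": "2.0",
    "id": 1,
    "method": "initialize",
    "params": {"processId": None, "rootUri": None, "capabilities": {}},
})
proc.stdin.write(f"Content-Length: {len(body)}\r\n\r\n{body}".encode())
proc.stdin.flush()

# A working server replies with a Content-Length-framed InitializeResult.
print(proc.stdout.read(120))
proc.kill()
```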
CLAUDE.md | 59
@@ -235,3 +235,62 @@ Warmup uses a reasonable sequence length (`block_size * 2`) instead of `max_model_len`
 | `gpu_memory_utilization` | 0.9 | GPU memory fraction for KV cache |
 | `enforce_eager` | False | Disable CUDA graphs if True |
 | `num_prefetch_blocks` | 2 | Ring buffer pipeline depth (deprecated, uses num_gpu_blocks) |
+
+## Benchmarking
+
+### Benchmark Files
+
+| File | Purpose | Key Parameters |
+|------|---------|----------------|
+| `bench.py` | Standard GPU benchmark | Pure GPU inference |
+| `bench_offload.py` | CPU offload benchmark | `enable_cpu_offload=True`, `num_gpu_blocks=8` |
+| `bench_vllm.py` | vLLM comparison | Uses vLLM API for baseline comparison |
+
+### Current Test Configuration
+
+All benchmark files are aligned to use:
+- **Model**: `~/models/Qwen3-0.6B/`
+- **max_model_len**: 40960 (limited by model's `max_position_embeddings`)
+- **Prefill test**: input_len = max_len - 1 (40959 tokens)
+- **Decode test**: input_len = max_len - 128, output_len = 128
+
+### Common Issues and Solutions
+
+**1. `max_num_batched_tokens` assertion error**
+```
+AssertionError: assert self.max_num_batched_tokens >= self.max_model_len
+```
+**Solution**: Set `max_num_batched_tokens=max_model_len` when using large context lengths.
+
+**2. CUDA graph block_tables dimension mismatch**
+```
+RuntimeError: The expanded size of the tensor (1) must match the existing size (2)
+```
+**Cause**: `input_len + output_len > max_model_len` causes more blocks than pre-allocated in CUDA graph.
+**Solution**: Ensure `input_len + output_len <= max_model_len`.
+
+**3. RoPE position embedding out of bounds**
+```
+Assertion `index out of bounds: 0 <= ... < 40960` failed
+```
+**Cause**: Sequence length exceeds model's `max_position_embeddings`.
+**Solution**: Check model's `config.json` for `max_position_embeddings` and limit `max_model_len` accordingly.
+
+### Model Context Length Limits
+
+| Model | max_position_embeddings | Notes |
+|-------|------------------------|-------|
+| Qwen3-0.6B | 40960 | ~40K context |
+| Qwen3-4B | 40960 | ~40K context |
+| Qwen2.5-7B-Instruct-1M | 1048576 | 1M context |
+
+**Important**: Always check `max_position_embeddings` in `config.json` before setting `max_model_len`.
+
+### Performance Reference (Qwen3-0.6B, 40K context)
+
+| Mode | Prefill (tok/s) | Decode (tok/s) |
+|------|-----------------|----------------|
+| GPU (bench.py) | ~18,000 | ~100 |
+| CPU Offload (bench_offload.py) | ~7,200 | ~3.5 |
+
+CPU offload trades performance for memory efficiency, enabling long-context inference on limited GPU memory.
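Taken together, the documentation above reduces to three constraints: `max_model_len` must not exceed the model's `max_position_embeddings`, `max_num_batched_tokens` must be at least `max_model_len`, and `input_len + output_len` must stay within `max_model_len`. A condensed sketch of that setup, mirroring what the bench scripts below are changed to (path and limits are the documented Qwen3-0.6B values):

```python
import os
from nanovllm import LLM

path = os.path.expanduser("~/models/Qwen3-0.6B/")
max_len = 40960  # Qwen3-0.6B max_position_embeddings; max_model_len cannot exceed it

llm = LLM(
    path,
    enforce_eager=False,
    max_model_len=max_len,
    max_num_batched_tokens=max_len,  # satisfies max_num_batched_tokens >= max_model_len
)

# Prefill test: one sequence of max_len - 1 tokens.
prefill_input_len = max_len - 1            # 40959

# Decode test: keep input + output within max_len so the CUDA graph's
# pre-allocated block_tables are not exceeded.
output_len = 128
decode_input_len = max_len - output_len    # 40832
assert decode_input_len + output_len <= max_len
```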
bench.py | 22
@@ -4,18 +4,18 @@ from random import randint, seed
 from nanovllm import LLM, SamplingParams
 
 
-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
 
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -34,8 +34,10 @@ def bench_prefill(llm, num_seqs, input_len):
 
 
 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_batched_tokens=max_len)
 
     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())
@@ -45,15 +47,15 @@ def main():
     print("=" * 60)
     # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)
 
     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    # bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    bench_decode(llm, num_seqs=1, max_input_len=4072, max_output_len=16)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len
 
 
 if __name__ == "__main__":
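With a single scalar `SamplingParams` and `ignore_eos=True`, every sequence now decodes exactly `output_len` tokens, so `num_seqs * output_len` counts the output exactly and the reported decode throughput is just output tokens over wall time. Illustrative arithmetic (the elapsed time is hypothetical, chosen to land near the GPU reference figure):

```python
# Decode throughput as bench_decode computes it.
num_seqs, output_len = 1, 128
elapsed = 1.28  # seconds (illustrative, not a measurement)
throughput = num_seqs * output_len / elapsed
print(f"{throughput:.2f} tok/s")  # 100.00 tok/s, the order of the GPU reference number
```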
bench_offload.py
@@ -9,18 +9,18 @@ from nanovllm.kvcache.sparse.hybrid import HybridPolicy
 from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
 
 
-def bench_decode(llm, num_seqs, input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
     prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_output_len) for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
 
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -95,18 +95,20 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--no-sparse", action="store_true", help="Disable sparse attention (baseline)")
     parser.add_argument("--topk", type=int, default=8, help="Top-K blocks for Quest")
-    parser.add_argument("--input-len", type=int, default=128 * 1024, help="Input length in tokens")
+    parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens (default: max_len - 1 for prefill, max_len - output_len for decode)")
     parser.add_argument("--output-len", type=int, default=128, help="Output length in tokens")
     args = parser.parse_args()
 
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
     llm = LLM(
         path,
         enforce_eager=False,
-        max_model_len=256 * 1024,
-        max_num_batched_tokens=256 * 1024,
+        max_model_len=max_len,
+        max_num_batched_tokens=max_len,
         enable_cpu_offload=True,
-        num_gpu_blocks=120,
+        num_gpu_blocks=8,  # Small GPU buffer for offload testing
         num_prefetch_blocks=4,
     )
 
@@ -120,15 +122,19 @@ def main():
     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())
 
+    # Default input lengths based on max_len
+    prefill_input_len = args.input_len if args.input_len else max_len - 1
+    decode_input_len = args.input_len if args.input_len else max_len - args.output_len
+
     print("=" * 60)
     print("Prefill Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=args.input_len)
+    bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
 
     print("=" * 60)
     print("Decode Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, input_len=args.input_len, max_output_len=args.output_len)
+    bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
 
 
 if __name__ == "__main__":
bench_vllm.py
@@ -5,19 +5,19 @@ from random import randint, seed
 from vllm import LLM, SamplingParams
 
 
-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]
 
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -37,8 +37,10 @@ def bench_prefill(llm, num_seqs, input_len):
 
 
 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096, max_num_seqs=128, gpu_memory_utilization=0.9)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_seqs=128, gpu_memory_utilization=0.9)
 
     # Warmup
     llm.generate([dict(prompt_token_ids=[0])], SamplingParams())
@@ -46,17 +48,17 @@ def main():
     print("=" * 60)
     print("Prefill Benchmark")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=1024)
+    # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    # bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)
 
     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    # bench_decode(llm, num_seqs=256, max_input_len=1024, max_output_len=1024)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len
 
 
 if __name__ == "__main__":