diff --git a/.claude/cclsp.json b/.claude/cclsp.json
new file mode 100644
index 0000000..ec70ecd
--- /dev/null
+++ b/.claude/cclsp.json
@@ -0,0 +1,59 @@
+{
+  "servers": [
+    {
+      "extensions": [
+        "py",
+        "pyi"
+      ],
+      "command": [
+        "uvx",
+        "--from",
+        "python-lsp-server",
+        "pylsp"
+      ],
+      "rootDir": ".",
+      "restartInterval": 5,
+      "initializationOptions": {
+        "settings": {
+          "pylsp": {
+            "plugins": {
+              "jedi_completion": {
+                "enabled": true
+              },
+              "jedi_definition": {
+                "enabled": true
+              },
+              "jedi_hover": {
+                "enabled": true
+              },
+              "jedi_references": {
+                "enabled": true
+              },
+              "jedi_signature_help": {
+                "enabled": true
+              },
+              "jedi_symbols": {
+                "enabled": true
+              },
+              "pylint": {
+                "enabled": false
+              },
+              "pycodestyle": {
+                "enabled": false
+              },
+              "pyflakes": {
+                "enabled": false
+              },
+              "yapf": {
+                "enabled": false
+              },
+              "rope_completion": {
+                "enabled": false
+              }
+            }
+          }
+        }
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
index 12651ad..935c01c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -235,3 +235,62 @@ Warmup uses a reasonable sequence length (`block_size * 2`) instead of `max_mode
 | `gpu_memory_utilization` | 0.9 | GPU memory fraction for KV cache |
 | `enforce_eager` | False | Disable CUDA graphs if True |
 | `num_prefetch_blocks` | 2 | Ring buffer pipeline depth (deprecated, uses num_gpu_blocks) |
+
+## Benchmarking
+
+### Benchmark Files
+
+| File | Purpose | Key Parameters |
+|------|---------|----------------|
+| `bench.py` | Standard GPU benchmark | Pure GPU inference |
+| `bench_offload.py` | CPU offload benchmark | `enable_cpu_offload=True`, `num_gpu_blocks=8` |
+| `bench_vllm.py` | vLLM comparison | Uses vLLM API for baseline comparison |
+
+### Current Test Configuration
+
+All benchmark files are aligned to use:
+- **Model**: `~/models/Qwen3-0.6B/`
+- **max_model_len**: 40960 (limited by the model's `max_position_embeddings`)
+- **Prefill test**: input_len = max_len - 1 (40959 tokens)
+- **Decode test**: input_len = max_len - 128, output_len = 128
+
+### Common Issues and Solutions
+
+**1. `max_num_batched_tokens` assertion error**
+```
+AssertionError: assert self.max_num_batched_tokens >= self.max_model_len
+```
+**Solution**: Set `max_num_batched_tokens=max_model_len` when using large context lengths.
+
+**2. CUDA graph block_tables dimension mismatch**
+```
+RuntimeError: The expanded size of the tensor (1) must match the existing size (2)
+```
+**Cause**: `input_len + output_len > max_model_len` requires more blocks than were pre-allocated for the CUDA graph.
+**Solution**: Ensure `input_len + output_len <= max_model_len`.
+
+**3. RoPE position embedding out of bounds**
+```
+Assertion `index out of bounds: 0 <= ... < 40960` failed
+```
+**Cause**: The sequence length exceeds the model's `max_position_embeddings`.
+**Solution**: Check the model's `config.json` for `max_position_embeddings` and limit `max_model_len` accordingly.
+
+### Model Context Length Limits
+
+| Model | max_position_embeddings | Notes |
+|-------|------------------------|-------|
+| Qwen3-0.6B | 40960 | ~40K context |
+| Qwen3-4B | 40960 | ~40K context |
+| Qwen2.5-7B-Instruct-1M | 1048576 | 1M context |
+
+**Important**: Always check `max_position_embeddings` in `config.json` before setting `max_model_len`.
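+
+A minimal sketch of that check (a hypothetical helper, not part of the benchmark scripts, which hard-code `max_len = 40960`), assuming a standard Hugging Face-style `config.json` sits next to the weights:
+
+```
+import json
+import os
+
+# Hypothetical helper for illustration only; bench*.py hard-code max_len = 40960.
+def safe_max_model_len(model_path, requested_len):
+    """Clamp a requested context length to the model's max_position_embeddings."""
+    config_path = os.path.join(os.path.expanduser(model_path), "config.json")
+    with open(config_path) as f:
+        config = json.load(f)
+    return min(requested_len, config["max_position_embeddings"])
+
+# Qwen3-0.6B reports 40960, so a larger request is clamped to 40960
+max_len = safe_max_model_len("~/models/Qwen3-0.6B/", 256 * 1024)
+```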
+
+### Performance Reference (Qwen3-0.6B, 40K context)
+
+| Mode | Prefill (tok/s) | Decode (tok/s) |
+|------|-----------------|----------------|
+| GPU (bench.py) | ~18,000 | ~100 |
+| CPU Offload (bench_offload.py) | ~7,200 | ~3.5 |
+
+CPU offload trades performance for memory efficiency, enabling long-context inference on limited GPU memory.
diff --git a/bench.py b/bench.py
index ac2c42a..0c7ebb5 100644
--- a/bench.py
+++ b/bench.py
@@ -4,18 +4,18 @@ from random import randint, seed
 from nanovllm import LLM, SamplingParams
 
 
-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
 
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -34,8 +34,10 @@ def bench_prefill(llm, num_seqs, input_len):
 
 
 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_batched_tokens=max_len)
 
     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())
@@ -45,15 +47,15 @@ def main():
     print("=" * 60)
     # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)
 
     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    # bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    bench_decode(llm, num_seqs=1, max_input_len=4072, max_output_len=16)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len
 
 
 if __name__ == "__main__":
diff --git a/bench_offload.py b/bench_offload.py
index 8055141..2863a11 100644
--- a/bench_offload.py
+++ b/bench_offload.py
@@ -9,18 +9,18 @@ from nanovllm.kvcache.sparse.hybrid import HybridPolicy
 from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
 
 
-def bench_decode(llm, num_seqs, input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
 
     prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_output_len) for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -95,18 +95,20 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--no-sparse", action="store_true", help="Disable sparse attention (baseline)")
    parser.add_argument("--topk", type=int, default=8, help="Top-K blocks for Quest")
-    parser.add_argument("--input-len", type=int, default=128 * 1024, help="Input length in tokens")
+    parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens (default: max_len - 1 for prefill, max_len - output_len for decode)")
     parser.add_argument("--output-len", type=int, default=128, help="Output length in tokens")
     args = parser.parse_args()
 
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
     llm = LLM(
         path,
         enforce_eager=False,
-        max_model_len=256 * 1024,
-        max_num_batched_tokens=256 * 1024,
+        max_model_len=max_len,
+        max_num_batched_tokens=max_len,
         enable_cpu_offload=True,
-        num_gpu_blocks=120,
+        num_gpu_blocks=8,  # Small GPU buffer for offload testing
         num_prefetch_blocks=4,
     )
 
@@ -120,15 +122,19 @@
     # Warmup
     llm.generate(["Benchmark: "], SamplingParams())
 
+    # Default input lengths based on max_len
+    prefill_input_len = args.input_len if args.input_len else max_len - 1
+    decode_input_len = args.input_len if args.input_len else max_len - args.output_len
+
     print("=" * 60)
     print("Prefill Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=args.input_len)
+    bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
 
     print("=" * 60)
     print("Decode Benchmark (CPU Offload)")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, input_len=args.input_len, max_output_len=args.output_len)
+    bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
 
 
 if __name__ == "__main__":
diff --git a/bench_vllm.py b/bench_vllm.py
index 8497f44..bf03609 100644
--- a/bench_vllm.py
+++ b/bench_vllm.py
@@ -5,19 +5,19 @@ from random import randint, seed
 from vllm import LLM, SamplingParams
 
 
-def bench_decode(llm, num_seqs, max_input_len, max_output_len):
+def bench_decode(llm, num_seqs, input_len, output_len):
     """Benchmark decode performance (original test)"""
     seed(0)
 
-    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
-    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
+    prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
+    sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
     prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = sum(sp.max_tokens for sp in sampling_params)
+    total_output_tokens = num_seqs * output_len
     throughput = total_output_tokens / t
-    print(f"[Decode] Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -37,8 +37,10 @@ def bench_prefill(llm, num_seqs, input_len):
 
 
 def main():
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096, max_num_seqs=128, gpu_memory_utilization=0.9)
+    path = os.path.expanduser("~/models/Qwen3-0.6B/")
+    # Note: Qwen3-0.6B max_position_embeddings = 40960, cannot exceed this
+    max_len = 40960
+    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_seqs=128, gpu_memory_utilization=0.9)
 
     # Warmup
     llm.generate([dict(prompt_token_ids=[0])], SamplingParams())
@@ -46,17 +48,17 @@ def main():
     print("=" * 60)
     print("Prefill Benchmark")
     print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=1024)
+    # bench_prefill(llm, num_seqs=1, input_len=1024)
     # bench_prefill(llm, num_seqs=1, input_len=2048)
-    # bench_prefill(llm, num_seqs=1, input_len=4095)
+    bench_prefill(llm, num_seqs=1, input_len=max_len - 1)
     # bench_prefill(llm, num_seqs=16, input_len=1024)
     # bench_prefill(llm, num_seqs=64, input_len=1024)
 
     print("=" * 60)
     print("Decode Benchmark")
     print("=" * 60)
-    bench_decode(llm, num_seqs=1, max_input_len=1024, max_output_len=1024)
-    # bench_decode(llm, num_seqs=256, max_input_len=1024, max_output_len=1024)
+    # bench_decode(llm, num_seqs=1, input_len=1024, output_len=1024)
+    bench_decode(llm, num_seqs=1, input_len=max_len - 128, output_len=128)  # input + output <= max_len
 
 
 if __name__ == "__main__":