From 86633004cae45d7d67457b86b9c00a45d1c751ff Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Wed, 14 Jan 2026 07:02:09 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20docs:=20add=2064k=20memory=20ana?= =?UTF-8?q?lysis=20and=20test=20configuration=20updates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive memory analysis for 64k inference on Llama 3.1 8B: New documentation: - docs/64k_memory_analysis.md: GPU-only vs offload memory analysis, OOM root cause (memory fragmentation), RTX 3090 limitations, theoretical vs actual memory usage breakdown Test configuration updates: - tests/test_ruler.py: Add --num-kv-buffers parameter for ring buffer size tuning (default 4, can reduce to 1 for lower memory) - Update default data_dir to ruler_64k - Update default max_model_len to 65664 for 64k support CLAUDE.md updates: - Add 64k_memory_analysis.md to documentation index - Document num_kv_buffers parameter in Configuration section - Add 64k hardware requirements note to Model Limits Key findings: 64k inference requires ~26GB (GPU-only) or ~23GB (offload) due to memory fragmentation on 24GB GPUs, making A100 (40GB+) the recommended hardware for 64k workloads. 
Co-Authored-By: Claude --- CLAUDE.md | 4 +- docs/64k_memory_analysis.md | 131 +++++++++++++++++++++++++++ docs/64k_mlp_activation_oom.md | 161 +++++++++++++++++++++++++++++++++ tests/test_ruler.py | 11 ++- 4 files changed, 303 insertions(+), 4 deletions(-) create mode 100644 docs/64k_memory_analysis.md create mode 100644 docs/64k_mlp_activation_oom.md diff --git a/CLAUDE.md b/CLAUDE.md index b54b9ff..8559748 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -59,6 +59,7 @@ PYTHONPATH=/home/zijie/Code/nano-vllm:$PYTHONPATH python tests/test_needle.py | [`docs/debugging_guide.md`](docs/debugging_guide.md) | PyTorch hooks for debugging, tensor comparison, memory profiling | | [`docs/gpu_only_performance_issue.md`](docs/gpu_only_performance_issue.md) | GPU-only mode slower than offload due to PagedAttention scatter overhead, optimization proposals | | [`docs/offload_accuracy_issue.md`](docs/offload_accuracy_issue.md) | **BUG**: CPU offload mode 66% accuracy vs 100% non-offload on RULER NIAH benchmark | +| [`docs/64k_memory_analysis.md`](docs/64k_memory_analysis.md) | 64k inference memory analysis: GPU-only vs offload, OOM root cause (fragmentation), RTX 3090 limitations | ## Configuration @@ -69,7 +70,7 @@ PYTHONPATH=/home/zijie/Code/nano-vllm:$PYTHONPATH python tests/test_needle.py | `gpu_memory_utilization` | 0.9 | GPU memory fraction | | `enable_cpu_offload` | False | Enable for long context | | `num_gpu_blocks` | 2 | GPU blocks for offload mode | -| `num_kv_buffers` | 4 | Ring buffer size for decode pipeline | +| `num_kv_buffers` | 4 | Ring buffer size (1-4), lower = less memory but slower decode | | `enforce_eager` | False | Set True to disable CUDA graphs | ## Benchmarking @@ -85,6 +86,7 @@ PYTHONPATH=/home/zijie/Code/nano-vllm:$PYTHONPATH python tests/test_needle.py - Qwen3-0.6B/4B: 40960 tokens - Qwen2.5-7B-Instruct-1M: 1048576 tokens - Llama-3.1-8B-Instruct: 131072 tokens +- **64k on RTX 3090/4090 (24GB)**: Requires CPU offload + optimizations, see 
[`docs/64k_memory_analysis.md`](docs/64k_memory_analysis.md) **Performance (Qwen3-4B, CPU Offload)**: - Prefill: ~5700-8000 tok/s (varies by context length) diff --git a/docs/64k_memory_analysis.md b/docs/64k_memory_analysis.md new file mode 100644 index 0000000..0e710aa --- /dev/null +++ b/docs/64k_memory_analysis.md @@ -0,0 +1,131 @@ +# 64k 推理内存分析 + +本文档分析 Llama 3.1 8B 模型在 64k 长度推理时的内存占用,以及 RTX 3090 (24GB) 上的 OOM 问题。 + +## 模型配置 + +```python +hidden_size = 4096 +intermediate_size = 14336 +num_layers = 32 +num_heads = 32 +num_kv_heads = 8 +head_dim = 128 +seq_len = 65536 +dtype = bfloat16 (2 bytes) +``` + +## 理论内存占用 + +### GPU Only 模式 + +| 组件 | 计算公式 | 内存占用 | +|------|----------|----------| +| 模型权重 | 8.03B × 2 bytes | **16.06 GB** | +| KV Cache | 32 × 65536 × 8 × 128 × 2 × 2 bytes | **8.59 GB**(8 GiB) | +| Prefill 激活值峰值 | max(QKV, MLP) | **~2 GB** | +| **总计** | | **~26 GB** | + +**结论**:GPU only 模式需要 ~26 GB,**RTX 3090 (24GB) 无法运行**。 + +### CPU Offload 模式 + +| 组件 | 计算公式 | 内存占用 | +|------|----------|----------| +| 模型权重 | 8.03B × 2 bytes | **16.06 GB** | +| Ring buffer | num_kv_buffers × seq_len × 4 KB/token(单层 K+V) | 258-1034 MB | +| GPU KV blocks | num_gpu_blocks × block_size × 128 KB/token | 256 MB (2 blocks) | +| Per-layer decode buffer | 32 layers × 缓冲 | 128 MB | +| 激活值峰值 (chunked) | chunk_size × hidden_size × 2 | ~50 MB | +| PyTorch 开销 | CUDA 上下文 + 碎片 | ~5-6 GB | +| **理论小计** | | **~17.5 GB** | +| **实际需求** | | **~23 GB** | + +**配置参数**: +- `num_kv_buffers`: Ring buffer 大小 (1-4),默认 4 +- `num_gpu_blocks`: GPU 上的 KV cache block 数量 +- `block_size`: 每个 block 的 token 数 + +## OOM 问题分析 + +### 实际观测(RTX 3090, num_kv_buffers=1) + +``` +PyTorch allocated: 22.49 GB +PyTorch reserved: 429 MB +Free: 306 MB +Total available: 735 MB +Failed to allocate: 508 MB (torch.cat) +``` + +### 内存碎片来源 + +| 来源 | 说明 | 影响 | +|------|------|------| +| Binned 分配器 | PyTorch 使用固定大小的内存池 | 中等 | +| torch.compile 缓存 | 编译后的 kernel 代码和常量 | 高 (~2-3 GB) | +| 频繁分配/释放 | chunked 处理中每个 chunk 的创建销毁 | 高 | +| 不同大小张量 | (128,4096), 
(65536,6144) 等 | 中等 | + +### torch.cat 内存需求 + +Chunked MLP 处理(chunk_size=128): +``` +65536 / 128 = 512 chunks +每个 chunk 输出: (128, 4096) × 2 bytes = 1 MB +torch.cat 拼接需要: (65536, 4096) × 2 bytes = 508 MB (连续) +``` + +## 已尝试的优化 + +| 优化项 | 效果 | +|--------|------| +| 移除 `@torch.compile` | PyTorch: 23.13 → 22.80 GB (-300 MB) | +| 减少 `num_kv_buffers` (4→1) | Ring buffer: 1034 → 258 MB (-776 MB) | +| Chunked QKV/MLP/LayerNorm | 峰值激活: ~2 GB → ~50 MB | +| 降低 GPU 利用率 (0.9→0.75) | 无明显效果 | +| 减小 chunk_size (4096→128) | 峰值降低,但 torch.cat 需要连续内存 | + +### 最终状态 + +``` +理论需求: ~17.5 GB +实际分配: 22.49 GB +剩余空间: 735 MB (306 MB + 429 MB reserved) +分配失败: 508 MB (torch.cat 需要连续内存) +``` + +## 结论 + +### 根本原因 + +**不是绝对内存不足,而是内存碎片导致的分配失败**。 + +理论需求 17.5 GB < 24 GB,但由于: +- PyTorch 开销(CUDA 上下文、碎片):~5-6 GB +- torch.compile 缓存:~2-3 GB(已移除) +- 内存碎片导致无法分配 508 MB 连续块 + +### 硬件限制 + +| GPU | 显存 | 64k GPU Only | 64k Offload | +|-----|------|--------------|--------------| +| RTX 3090 | 24 GB | ❌ | ⚠️ 碎片问题 | +| RTX 4090 | 24 GB | ❌ | ⚠️ 碎片问题 | +| A100 | 40 GB | ✅ | ✅ | +| A100 | 80 GB | ✅ | ✅ | + +### 建议 + +1. **64k 推理建议使用 40GB+ 显存的 GPU** +2. RTX 3090/4090 适合 32k 或更短的场景 +3. 如必须在 24GB GPU 上运行 64k: + - 使用 RAPIDS RMM 分配器 + - 预分配 torch.cat 需要的内存 + - 或使用流式处理避免 torch.cat + +## 参考 + +- [PyTorch 内存管理文档](https://docs.pytorch.org/docs/stable/generated/torch.cuda.memory.memory_stats.html) +- [PyTorch 内存碎片讨论](https://discuss.pytorch.org/t/how-to-reduce-memory-fragmentation-when-enable-expandable-segments/221805) +- [STWeaver - 减少 79% 内存碎片](https://arxiv.org/html/2507.16274v1) diff --git a/docs/64k_mlp_activation_oom.md b/docs/64k_mlp_activation_oom.md new file mode 100644 index 0000000..6b55118 --- /dev/null +++ b/docs/64k_mlp_activation_oom.md @@ -0,0 +1,161 @@ +# 64K Prefill MLP Activation OOM Issue + +## Problem Summary + +When running RULER benchmark with 64K context length using CPU offload mode, OOM occurs during MLP forward pass in `run_layerwise_offload_prefill`. 
The KV cache is successfully offloaded to CPU, but MLP intermediate activations exceed available GPU memory. + +## Environment + +- GPU: RTX 3090 (24GB) +- Model: LLaMA 3.1 8B +- Sequence Length: 65536 tokens +- Mode: `enable_cpu_offload=True`, `num_gpu_blocks=2` + +## Error Message + +``` +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.47 GiB. +GPU 0 has a total capacity of 23.57 GiB of which 2.66 GiB is free. +Including non-PyTorch memory, this process has 20.88 GiB memory in use. +Of the allocated memory 20.51 GiB is allocated by PyTorch, and 32.26 MiB +is reserved by PyTorch but unallocated. +``` + +## Stack Trace + +``` +File "nanovllm/engine/model_runner.py", line 843, in run_layerwise_offload_prefill + hidden_states = layer.mlp(hidden_states) + File "nanovllm/models/llama.py", line 103, in forward + gate_up = self.gate_up_proj(x) + File "nanovllm/layers/linear.py", line 73, in forward + return F.linear(x, self.weight, self.bias) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.47 GiB. +``` + +## Root Cause Analysis + +### Memory Breakdown + +| Component | Calculation | Size | +|-----------|-------------|------| +| Model weights (BF16) | 8B params × 2 bytes | ~16 GB | +| GPU KV cache | 2 blocks × 1024 tokens × 128 KB/token | ~256 MB | +| **Remaining for activations** | 24 - 16 - overhead | **~6-7 GB** | + +### MLP Activation Memory (per layer) + +For LLaMA 3.1 8B with `hidden_size=4096`, `intermediate_size=14336`: + +| Tensor | Shape | Size (BF16) | +|--------|-------|-------------| +| MLP input | [65536, 4096] | 512 MB | +| gate_up output | [65536, 28672] | **3.47 GB** | +| down_proj input | [65536, 14336] | 1.75 GB | +| MLP output | [65536, 4096] | 512 MB | + +**Peak MLP memory**: ~3.5-4 GB for intermediate tensors + +### Why OOM Occurs + +1. Model weights consume ~16 GB (loaded on GPU for layer-wise processing) +2. Available memory: ~7 GB +3. MLP `gate_up_proj` output: 3.47 GB +4. 
Additional tensors (input, activation temporaries, etc.): ~1-2 GB +5. **Total required > Available** → OOM + +## Code Location + +The issue is in `nanovllm/engine/model_runner.py`: + +```python +# Line 843 in run_layerwise_offload_prefill +hidden_states = layer.mlp(hidden_states) # <-- OOM here +``` + +The entire sequence (65536 tokens) is passed through MLP in one shot. + +## Current Configuration + +From `model_wrappers.py` (RULER integration): + +```python +llm_kwargs = { + "max_model_len": max_model_len, # 128 * 1024 + "max_num_batched_tokens": max_model_len, # Same as max_model_len + "enable_cpu_offload": True, + "num_gpu_blocks": 2, + ... +} +``` + +Setting `max_num_batched_tokens = max_model_len` causes nanovllm to process all tokens at once. + +## Potential Solutions + +### Option 1: Chunked MLP Processing + +Modify `run_layerwise_offload_prefill` to process MLP in chunks: + +```python +# Instead of: +hidden_states = layer.mlp(hidden_states) + +# Do: +chunk_size = 8192 # Process 8K tokens at a time +chunks = hidden_states.split(chunk_size, dim=0) +outputs = [] +for chunk in chunks: + outputs.append(layer.mlp(chunk)) +hidden_states = torch.cat(outputs, dim=0) +``` + +### Option 2: Activation Checkpointing + +Use gradient checkpointing to recompute activations instead of storing them (caveat: this only saves memory when autograd would otherwise retain activations for backward; if prefill already runs under `torch.no_grad()` / `torch.inference_mode()`, as is usual for inference, it will not lower the `gate_up_proj` peak — verify before adopting): + +```python +from torch.utils.checkpoint import checkpoint +hidden_states = checkpoint(layer.mlp, hidden_states, use_reentrant=False) +``` + +### Option 3: Reduce Chunk Size via Config + +Add a new config parameter `prefill_chunk_size` to control how many tokens are processed per forward pass. 
+ +## Memory Estimation Formula + +For a given sequence length `S` and model config: + +``` +MLP_peak_memory = S × intermediate_size × 2 × 2 bytes + = S × 14336 × 4 bytes + +For S = 65536: +MLP_peak = 65536 × 14336 × 4 = 3.76 GB +``` + +Maximum safe sequence length for RTX 3090 (24GB): +``` +S_max = available_memory / (intermediate_size × 4) + = 6GB / (14336 × 4) + ≈ 100K tokens (theoretical) + ≈ 8-16K tokens (practical, with safety margin) +``` + +## Reproduction Steps + +```bash +cd /home/zijie/Code/COMPASS/eval/RULER/scripts + +# Set SEQ_LENGTHS to 65536 in config_models.sh +# Then run: +./run.sh llama3.1-8b-nanovllm synthetic --metric full --task niah_single_1 +``` + +## Related Files + +- `nanovllm/engine/model_runner.py`: `run_layerwise_offload_prefill()` (line 751+) +- `nanovllm/models/llama.py`: `LlamaMLP.forward()` (line 103) +- `nanovllm/config.py`: Config parameters +- RULER integration: `eval/RULER/scripts/pred/model_wrappers.py` diff --git a/tests/test_ruler.py b/tests/test_ruler.py index a386c56..7dcc7dc 100644 --- a/tests/test_ruler.py +++ b/tests/test_ruler.py @@ -38,11 +38,11 @@ from nanovllm import LLM, SamplingParams # Constants # ============================================================ -DEFAULT_DATA_DIR = Path(__file__).parent / "data/ruler_32k" +DEFAULT_DATA_DIR = Path(__file__).parent / "data/ruler_64k" DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct") # Note: max_model_len must be > max_input_len to leave room for output tokens -# 32k benchmark has inputs up to 32760 tokens, so we need 32768 + 128 = 32896 -DEFAULT_MAX_MODEL_LEN = 32896 +# 64k benchmark has inputs up to 65536 tokens, so we need 65536 + 128 = 65664 +DEFAULT_MAX_MODEL_LEN = 65664 DEFAULT_MAX_NEW_TOKENS = 128 # Larger for multi-value tasks # Task categories for evaluation @@ -222,6 +222,7 @@ def run_ruler_benchmark( enable_cpu_offload: bool = False, num_gpu_blocks: int = 4, block_size: int = 1024, + num_kv_buffers: int = 4, gpu_utilization: float = 0.9, 
enforce_eager: bool = True, verbose: bool = True, @@ -270,6 +271,7 @@ def run_ruler_benchmark( } if enable_cpu_offload: llm_kwargs["num_gpu_blocks"] = num_gpu_blocks + llm_kwargs["num_kv_buffers"] = num_kv_buffers llm = LLM(model_path, **llm_kwargs) @@ -356,6 +358,8 @@ if __name__ == "__main__": help="Number of GPU blocks for CPU offload (default: 4)") parser.add_argument("--block-size", type=int, default=1024, help="KV cache block size (default: 1024)") + parser.add_argument("--num-kv-buffers", type=int, default=4, + help="Number of KV buffers for ring buffer (default: 4)") parser.add_argument("--gpu-utilization", type=float, default=0.9, help="GPU memory utilization (default: 0.9)") parser.add_argument("--use-cuda-graph", action="store_true", @@ -379,6 +383,7 @@ if __name__ == "__main__": enable_cpu_offload=args.enable_offload, num_gpu_blocks=args.num_gpu_blocks, block_size=args.block_size, + num_kv_buffers=args.num_kv_buffers, gpu_utilization=args.gpu_utilization, enforce_eager=not args.use_cuda_graph, verbose=not args.quiet,