From 86633004cae45d7d67457b86b9c00a45d1c751ff Mon Sep 17 00:00:00 2001
From: Zijie Tian <zijietian@mail.xmu.edu.cn>
Date: Wed, 14 Jan 2026 07:02:09 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20docs:=20add=2064k=20memory=20ana?=
 =?UTF-8?q?lysis=20and=20test=20configuration=20updates?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add comprehensive memory analysis for 64k inference on Llama 3.1 8B:

New documentation:
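- docs/64k_memory_analysis.md: GPU-only vs offload memory analysis,
  OOM root cause (memory fragmentation), RTX 3090 limitations,
  theoretical vs actual memory usage breakdown
- docs/64k_mlp_activation_oom.md: 64K prefill MLP activation OOM report
  (error message, root cause analysis, candidate fixes such as chunked MLP)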

Test configuration updates:
- tests/test_ruler.py: Add --num-kv-buffers parameter for ring buffer
  size tuning (default 4, can reduce to 1 for lower memory)
- Update default data_dir to ruler_64k
- Update default max_model_len to 65664 for 64k support

CLAUDE.md updates:
- Add 64k_memory_analysis.md to documentation index
- Document num_kv_buffers parameter in Configuration section
- Add 64k hardware requirements note to Model Limits

Key findings: 64k inference needs ~26GB in GPU-only mode and ~23GB with
CPU offload; on 24GB GPUs the offload path still OOMs because of memory
fragmentation, so A100 (40GB+) is the recommended hardware for 64k workloads.

Co-Authored-By: Claude <noreply@anthropic.com>
---
 CLAUDE.md                      |   4 +-
 docs/64k_memory_analysis.md    | 131 +++++++++++++++++++++++++++
 docs/64k_mlp_activation_oom.md | 161 +++++++++++++++++++++++++++++++++
 tests/test_ruler.py            |  11 ++-
 4 files changed, 303 insertions(+), 4 deletions(-)
 create mode 100644 docs/64k_memory_analysis.md
 create mode 100644 docs/64k_mlp_activation_oom.md

diff --git a/CLAUDE.md b/CLAUDE.md
index b54b9ff..8559748 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -59,6 +59,7 @@ PYTHONPATH=/home/zijie/Code/nano-vllm:$PYTHONPATH python tests/test_needle.py
 | [`docs/debugging_guide.md`](docs/debugging_guide.md) | PyTorch hooks for debugging, tensor comparison, memory profiling |
 | [`docs/gpu_only_performance_issue.md`](docs/gpu_only_performance_issue.md) | GPU-only mode slower than offload due to PagedAttention scatter overhead, optimization proposals |
 | [`docs/offload_accuracy_issue.md`](docs/offload_accuracy_issue.md) | **BUG**: CPU offload mode 66% accuracy vs 100% non-offload on RULER NIAH benchmark |
+| [`docs/64k_memory_analysis.md`](docs/64k_memory_analysis.md) | 64k inference memory analysis: GPU-only vs offload, OOM root cause (fragmentation), RTX 3090 limitations |
 
 ## Configuration
 
@@ -69,7 +70,7 @@ PYTHONPATH=/home/zijie/Code/nano-vllm:$PYTHONPATH python tests/test_needle.py
 | `gpu_memory_utilization` | 0.9 | GPU memory fraction |
 | `enable_cpu_offload` | False | Enable for long context |
 | `num_gpu_blocks` | 2 | GPU blocks for offload mode |
-| `num_kv_buffers` | 4 | Ring buffer size for decode pipeline |
+| `num_kv_buffers` | 4 | Ring buffer size (1-4), lower = less memory but slower decode |
 | `enforce_eager` | False | Set True to disable CUDA graphs |
 
 ## Benchmarking
@@ -85,6 +86,7 @@ PYTHONPATH=/home/zijie/Code/nano-vllm:$PYTHONPATH python tests/test_needle.py
 - Qwen3-0.6B/4B: 40960 tokens
 - Qwen2.5-7B-Instruct-1M: 1048576 tokens
 - Llama-3.1-8B-Instruct: 131072 tokens
+- **64k on RTX 3090/4090 (24GB)**: Requires CPU offload + optimizations, see [`docs/64k_memory_analysis.md`](docs/64k_memory_analysis.md)
 
 **Performance (Qwen3-4B, CPU Offload)**:
 - Prefill: ~5700-8000 tok/s (varies by context length)
diff --git a/docs/64k_memory_analysis.md b/docs/64k_memory_analysis.md
new file mode 100644
index 0000000..0e710aa
--- /dev/null
+++ b/docs/64k_memory_analysis.md
@@ -0,0 +1,131 @@
+# 64k 推理内存分析
+
+本文档分析 Llama 3.1 8B 模型在 64k 长度推理时的内存占用，以及 RTX 3090 (24GB) 上的 OOM 问题。
+
+## 模型配置
+
+```python
+hidden_size = 4096
+intermediate_size = 14336
+num_layers = 32
+num_heads = 32
+num_kv_heads = 8
+head_dim = 128
+seq_len = 65536
+dtype = bfloat16 (2 bytes)
+```
+
+## 理论内存占用
+
+### GPU Only 模式
+
+| 组件 | 计算公式 | 内存占用 |
+|------|----------|----------|
+| 模型权重 | 8.03B × 2 bytes | **16.06 GB** |
+| KV Cache | 32 × 65536 × 8 × 128 × 2 × 2 | **8.59 GB** (8 GiB) |
+| Prefill 激活值峰值 | max(QKV, MLP) | **~2 GB** |
+| **总计** | | **~26 GB** |
+
+**结论**：GPU only 模式需要 ~26 GB，**RTX 3090 (24GB) 无法运行**。
+
+### CPU Offload 模式
+
+| 组件 | 计算公式 | 内存占用 |
+|------|----------|----------|
+| 模型权重 | 8.03B × 2 bytes | **16.06 GB** |
+| Ring buffer | num_kv_buffers × seq_len × 4 KB/token (单层 K+V) | 258-1034 MB |
+| GPU KV blocks | num_gpu_blocks × block_size × 128 KB/token | 256 MB (2 blocks) |
+| Per-layer decode buffer | 32 layers × 缓冲 | 128 MB |
+| 激活值峰值 (chunked) | chunk_size × hidden_size × 2 | ~50 MB |
+| PyTorch 开销 | CUDA 上下文 + 碎片 | ~5-6 GB |
+| **理论小计** | | **~17.5 GB** |
+| **实际需求** | | **~23 GB** |
+
+**配置参数**：
+- `num_kv_buffers`: Ring buffer 大小 (1-4)，默认 4
+- `num_gpu_blocks`: GPU 上的 KV cache block 数量
+- `block_size`: 每个 block 的 token 数
+
+## OOM 问题分析
+
+### 实际观测（RTX 3090, num_kv_buffers=1）
+
+```
+PyTorch allocated:     22.49 GB
+PyTorch reserved:      429 MB
+Free:                  306 MB
+Total available:       735 MB
+Failed to allocate:    508 MB (torch.cat)
+```
+
+### 内存碎片来源
+
+| 来源 | 说明 | 影响 |
+|------|------|------|
+| Binned 分配器 | PyTorch 使用固定大小的内存池 | 中等 |
+| torch.compile 缓存 | 编译后的 kernel 代码和常量 | 高 (~2-3 GB) |
+| 频繁分配/释放 | chunked 处理中每个 chunk 的创建销毁 | 高 |
+| 不同大小张量 | (128,4096), (65536,6144) 等 | 中等 |
+
+### torch.cat 内存需求
+
+Chunked MLP 处理（chunk_size=128）：
+```
+65536 / 128 = 512 chunks
+每个 chunk 输出: (128, 4096) × 2 bytes = 1 MB
+torch.cat 拼接需要: (65536, 4096) × 2 bytes ≈ 512 MB (连续；实测失败的分配请求为 508 MB)
+```
+
+## 已尝试的优化
+
+| 优化项 | 效果 |
+|--------|------|
+| 移除 `@torch.compile` | PyTorch: 23.13 → 22.80 GB (-300 MB) |
+| 减少 `num_kv_buffers` (4→1) | Ring buffer: 1034 → 258 MB (-776 MB) |
+| Chunked QKV/MLP/LayerNorm | 峰值激活: ~2 GB → ~50 MB |
+| 降低 GPU 利用率 (0.9→0.75) | 无明显效果 |
+| 减小 chunk_size (4096→128) | 峰值降低，但 torch.cat 需要连续内存 |
+
+### 最终状态
+
+```
+理论需求:    ~17.5 GB
+实际分配:    22.49 GB
+剩余空间:    735 MB (306 MB + 429 MB reserved)
+分配失败:    508 MB (torch.cat 需要连续内存)
+```
+
+## 结论
+
+### 根本原因
+
+**不是绝对内存不足，而是内存碎片导致的分配失败**。
+
+理论需求 17.5 GB < 24 GB，但由于以下因素，实际运行时仍然分配失败：
+- PyTorch 开销（CUDA 上下文、碎片）：~5-6 GB
+- torch.compile 缓存：~2-3 GB（已移除）
+- 内存碎片导致无法分配 508 MB 连续块
+
+### 硬件限制
+
+| GPU | 显存 | 64k GPU Only | 64k Offload |
+|-----|------|--------------|--------------|
+| RTX 3090 | 24 GB | ❌ | ⚠️ 碎片问题 |
+| RTX 4090 | 24 GB | ❌ | ⚠️ 碎片问题 |
+| A100 | 40 GB | ✅ | ✅ |
+| A100 | 80 GB | ✅ | ✅ |
+
+### 建议
+
+1. **64k 推理建议使用 40GB+ 显存的 GPU**
+2. RTX 3090/4090 适合 32k 或更短的场景
+3. 如必须在 24GB GPU 上运行 64k：
+   - 使用 RAPIDS RMM 分配器
+   - 预分配 torch.cat 需要的内存
+   - 或使用流式处理避免 torch.cat
+
+## 参考
+
+- [PyTorch 内存管理文档](https://docs.pytorch.org/docs/stable/generated/torch.cuda.memory.memory_stats.html)
+- [PyTorch 内存碎片讨论](https://discuss.pytorch.org/t/how-to-reduce-memory-fragmentation-when-enable-expandable-segments/221805)
+- [STWeaver - 减少 79% 内存碎片](https://arxiv.org/html/2507.16274v1)
diff --git a/docs/64k_mlp_activation_oom.md b/docs/64k_mlp_activation_oom.md
new file mode 100644
index 0000000..6b55118
--- /dev/null
+++ b/docs/64k_mlp_activation_oom.md
@@ -0,0 +1,161 @@
+# 64K Prefill MLP Activation OOM Issue
+
+## Problem Summary
+
+When running RULER benchmark with 64K context length using CPU offload mode, OOM occurs during MLP forward pass in `run_layerwise_offload_prefill`. The KV cache is successfully offloaded to CPU, but MLP intermediate activations exceed available GPU memory.
+
+## Environment
+
+- GPU: RTX 3090 (24GB)
+- Model: LLaMA 3.1 8B
+- Sequence Length: 65536 tokens
+- Mode: `enable_cpu_offload=True`, `num_gpu_blocks=2`
+
+## Error Message
+
+```
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.47 GiB.
+GPU 0 has a total capacity of 23.57 GiB of which 2.66 GiB is free.
+Including non-PyTorch memory, this process has 20.88 GiB memory in use.
+Of the allocated memory 20.51 GiB is allocated by PyTorch, and 32.26 MiB
+is reserved by PyTorch but unallocated.
+```
+
+## Stack Trace
+
+```
+File "nanovllm/engine/model_runner.py", line 843, in run_layerwise_offload_prefill
+    hidden_states = layer.mlp(hidden_states)
+  File "nanovllm/models/llama.py", line 103, in forward
+    gate_up = self.gate_up_proj(x)
+  File "nanovllm/layers/linear.py", line 73, in forward
+    return F.linear(x, self.weight, self.bias)
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.47 GiB.
+```
+
+## Root Cause Analysis
+
+### Memory Breakdown
+
+| Component | Calculation | Size |
+|-----------|-------------|------|
+| Model weights (BF16) | 8B params × 2 bytes | ~16 GB |
+| GPU KV cache | 2 blocks × 1024 tokens × 128 KB/token | ~256 MB |
+| **Remaining for activations** | 24 - 16 - overhead | **~6-7 GB** |
+
+### MLP Activation Memory (per layer)
+
+For LLaMA 3.1 8B with `hidden_size=4096`, `intermediate_size=14336`:
+
+| Tensor | Shape | Size (BF16) |
+|--------|-------|-------------|
+| MLP input | [65536, 4096] | 512 MB |
+| gate_up output | [65536, 28672] | **3.47 GB** |
+| down_proj input | [65536, 14336] | 1.75 GB |
+| MLP output | [65536, 4096] | 512 MB |
+
+**Peak MLP memory**: ~3.5-4 GB for intermediate tensors
+
+### Why OOM Occurs
+
+1. Model weights consume ~16 GB (loaded on GPU for layer-wise processing)
+2. Available memory: ~7 GB
+3. MLP `gate_up_proj` output: 3.47 GB
+4. Additional tensors (MLP input, residual, other temporaries; there are no gradients at inference): ~1-2 GB
+5. **Total required > Available** → OOM
+
+## Code Location
+
+The issue is in `nanovllm/engine/model_runner.py`:
+
+```python
+# Line 843 in run_layerwise_offload_prefill
+hidden_states = layer.mlp(hidden_states)  # <-- OOM here
+```
+
+The entire sequence (65536 tokens) is passed through MLP in one shot.
+
+## Current Configuration
+
+From `model_wrappers.py` (RULER integration):
+
+```python
+llm_kwargs = {
+    "max_model_len": max_model_len,           # 128 * 1024
+    "max_num_batched_tokens": max_model_len,  # Same as max_model_len
+    "enable_cpu_offload": True,
+    "num_gpu_blocks": 2,
+    ...
+}
+```
+
+Setting `max_num_batched_tokens = max_model_len` causes nanovllm to process all tokens at once.
+
+## Potential Solutions
+
+### Option 1: Chunked MLP Processing
+
+Modify `run_layerwise_offload_prefill` to process MLP in chunks:
+
+```python
+# Instead of:
+hidden_states = layer.mlp(hidden_states)
+
+# Do:
+chunk_size = 8192  # Process 8K tokens at a time
+chunks = hidden_states.split(chunk_size, dim=0)
+outputs = []
+for chunk in chunks:
+    outputs.append(layer.mlp(chunk))
+hidden_states = torch.cat(outputs, dim=0)
+```
+
+### Option 2: Activation Checkpointing
+
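+Use gradient (activation) checkpointing to recompute activations instead of storing them. Note: this only saves memory when autograd is keeping activations for a backward pass; under `torch.no_grad()` / inference it does not reduce the peak of a single `gate_up_proj` output, so Option 1 is the more relevant fix here: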
+
+```python
+from torch.utils.checkpoint import checkpoint
+hidden_states = checkpoint(layer.mlp, hidden_states, use_reentrant=False)
+```
+
+### Option 3: Reduce Chunk Size via Config
+
+Add a new config parameter `prefill_chunk_size` to control how many tokens are processed per forward pass.
+
+## Memory Estimation Formula
+
+For a given sequence length `S` and model config:
+
+```
+MLP_peak_memory = S × intermediate_size × 2 × 2 bytes
+                = S × 14336 × 4 bytes
+
+For S = 65536:
+MLP_peak = 65536 × 14336 × 4 = 3.76 GB
+```
+
+Maximum safe sequence length for RTX 3090 (24GB):
+```
+S_max = available_memory / (intermediate_size × 4)
+      = 6GB / (14336 × 4)
+      ≈ 100K tokens (theoretical)
+      ≈ 8-16K tokens (practical, with safety margin)
+```
+
+## Reproduction Steps
+
+```bash
+cd /home/zijie/Code/COMPASS/eval/RULER/scripts
+
+# Set SEQ_LENGTHS to 65536 in config_models.sh
+# Then run:
+./run.sh llama3.1-8b-nanovllm synthetic --metric full --task niah_single_1
+```
+
+## Related Files
+
+- `nanovllm/engine/model_runner.py`: `run_layerwise_offload_prefill()` (line 751+)
+- `nanovllm/models/llama.py`: `LlamaMLP.forward()` (line 103)
+- `nanovllm/config.py`: Config parameters
+- RULER integration: `eval/RULER/scripts/pred/model_wrappers.py`
diff --git a/tests/test_ruler.py b/tests/test_ruler.py
index a386c56..7dcc7dc 100644
--- a/tests/test_ruler.py
+++ b/tests/test_ruler.py
@@ -38,11 +38,11 @@ from nanovllm import LLM, SamplingParams
 # Constants
 # ============================================================
 
-DEFAULT_DATA_DIR = Path(__file__).parent / "data/ruler_32k"
+DEFAULT_DATA_DIR = Path(__file__).parent / "data/ruler_64k"
 DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
 # Note: max_model_len must be > max_input_len to leave room for output tokens
-# 32k benchmark has inputs up to 32760 tokens, so we need 32768 + 128 = 32896
-DEFAULT_MAX_MODEL_LEN = 32896
+# 64k benchmark has inputs up to 65536 tokens, so we need 65536 + 128 = 65664
+DEFAULT_MAX_MODEL_LEN = 65664
 DEFAULT_MAX_NEW_TOKENS = 128  # Larger for multi-value tasks
 
 # Task categories for evaluation
@@ -222,6 +222,7 @@ def run_ruler_benchmark(
     enable_cpu_offload: bool = False,
     num_gpu_blocks: int = 4,
     block_size: int = 1024,
+    num_kv_buffers: int = 4,
     gpu_utilization: float = 0.9,
     enforce_eager: bool = True,
     verbose: bool = True,
@@ -270,6 +271,7 @@ def run_ruler_benchmark(
     }
     if enable_cpu_offload:
         llm_kwargs["num_gpu_blocks"] = num_gpu_blocks
+        llm_kwargs["num_kv_buffers"] = num_kv_buffers
 
     llm = LLM(model_path, **llm_kwargs)
 
@@ -356,6 +358,8 @@ if __name__ == "__main__":
                         help="Number of GPU blocks for CPU offload (default: 4)")
     parser.add_argument("--block-size", type=int, default=1024,
                         help="KV cache block size (default: 1024)")
+    parser.add_argument("--num-kv-buffers", type=int, default=4,
+                        help="Number of KV buffers for ring buffer (default: 4)")
     parser.add_argument("--gpu-utilization", type=float, default=0.9,
                         help="GPU memory utilization (default: 0.9)")
     parser.add_argument("--use-cuda-graph", action="store_true",
@@ -379,6 +383,7 @@ if __name__ == "__main__":
         enable_cpu_offload=args.enable_offload,
         num_gpu_blocks=args.num_gpu_blocks,
         block_size=args.block_size,
+        num_kv_buffers=args.num_kv_buffers,
         gpu_utilization=args.gpu_utilization,
         enforce_eager=not args.use_cuda_graph,
         verbose=not args.quiet,