From 79c4df4a27dcf1f59517626a056631a7521f9e50 Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Thu, 8 Jan 2026 23:42:30 +0800 Subject: [PATCH] [claudesquad] update from 'int-minference-1' on 08 Jan 26 23:42 CST --- CLAUDE.md | 1 + docs/sparse_offload_integration.md | 386 +++++++++++++++++++++++++++++ 2 files changed, 387 insertions(+) create mode 100644 docs/sparse_offload_integration.md diff --git a/CLAUDE.md b/CLAUDE.md index 59ab656..e90185f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,6 +61,7 @@ PYTHONPATH=/home/zijie/Code/nano-vllm:$PYTHONPATH python tests/test_needle.py |----------|---------| | [`docs/architecture_guide.md`](docs/architecture_guide.md) | Core components, layer-wise CPU offload design, prefill/decode flows, implementation details | | [`docs/sparse_attention_guide.md`](docs/sparse_attention_guide.md) | Block sparse attention methods (MInference, FlexPrefill, XAttention, Quest), computation flow | +| [`docs/sparse_offload_integration.md`](docs/sparse_offload_integration.md) | Sparse policy integration with layerwise offload, `requires_block_selection` interface design | | [`docs/layerwise_offload_memory_analysis.md`](docs/layerwise_offload_memory_analysis.md) | Memory allocation analysis with theoretical formulas and empirical validation (< 5% error) | | [`docs/debugging_guide.md`](docs/debugging_guide.md) | PyTorch hooks for debugging, tensor comparison, memory profiling | | [`docs/gpu_only_performance_issue.md`](docs/gpu_only_performance_issue.md) | GPU-only mode slower than offload due to PagedAttention scatter overhead, optimization proposals | diff --git a/docs/sparse_offload_integration.md b/docs/sparse_offload_integration.md new file mode 100644 index 0000000..43b8566 --- /dev/null +++ b/docs/sparse_offload_integration.md @@ -0,0 +1,386 @@ +# Sparse Policy Integration with Layerwise Offload + +This document describes the architecture and design of integrating sparse attention policies (MInference, Quest) with the layerwise CPU offload execution path. + +## Design Goals + +1. **Extend sparse policies to offload path**: GPU-only path already supports sparse policies, but layerwise offload bypasses them +2. **Maintain encapsulation**: All `copy_()` operations must be inside OffloadEngine, not exposed to model_runner +3. **Distinguish policy types**: Some policies affect attention computation (MInference), others affect KV load strategy (Quest) +4. **Extensible architecture**: Easy to add new sparse policies in the future + +## Key Insight + +The existing sparse policy implementation works, but the layerwise offload path bypasses it: + +| Path | Attention Method | Sparse Support | +|------|------------------|----------------| +| GPU-only | `attention.py` → `sparse_prefill_attention()` | YES | +| Layerwise offload | `model_runner.py` → `flash_attn_varlen_func()` | NO (direct call) | + +## Two Types of Sparse Policies + +The fundamental difference between sparse policies: + +| Policy | Affects Attention Computation | Affects KV Load Strategy | `select_blocks()` Behavior | +|--------|------------------------------|--------------------------|---------------------------| +| **MInference** | YES (`sparse_prefill_attention`) | NO | `return available_blocks` (all) | +| **Quest** | NO | YES | Returns Top-K subset | + +- **MInference**: Only changes how attention is computed, doesn't affect external load/offload flow +- **Quest**: Selectively loads only some blocks, affects H2D transfer + +## The `requires_block_selection` Interface Flag + +To distinguish these policy types, we add a flag to the base class: + +```python +# nanovllm/kvcache/sparse/policy.py +class SparsePolicy(ABC): + # Existing flags + supports_prefill: bool = True + supports_decode: bool = True + + # NEW: Whether this policy requires selective block loading + # If True: OffloadEngine will call select_blocks() before loading + # If False: OffloadEngine will load all blocks (select_blocks ignored) + requires_block_selection: bool = False +``` + +### Policy Implementations + +```python +# MInference: prefill-only, no block selection +class MInferencePolicy(SparsePolicy): + supports_prefill = True + supports_decode = False + requires_block_selection = False # Only affects attention computation + +# Quest: decode-only, requires block selection +class QuestPolicy(SparsePolicy): + supports_prefill = False + supports_decode = True + requires_block_selection = True # Affects KV load strategy + +# Full attention: baseline +class FullAttentionPolicy(SparsePolicy): + supports_prefill = True + supports_decode = True + requires_block_selection = False # Load all blocks +``` + +## OffloadEngine Encapsulation + +All KV cache operations are encapsulated in OffloadEngine. The model_runner never directly accesses internal storage. + +### Prefill: Synchronous Offload with Hooks + +```python +# nanovllm/kvcache/offload_engine.py +def offload_layer_kv_sync( + self, + layer_id: int, + k: Tensor, + v: Tensor, + cpu_block_ids: List[int], + total_tokens: int, +) -> None: + """ + Synchronously offload layer KV to CPU. + Calls sparse policy hooks internally. + """ + for i, cpu_block_id in enumerate(cpu_block_ids): + start = i * self.block_size + end = min(start + self.block_size, total_tokens) + actual_size = end - start + + # Hook: notify sparse policy BEFORE offload (k still on GPU) + if self.sparse_policy is not None: + self.sparse_policy.on_prefill_offload( + cpu_block_id, layer_id, k[start:end], actual_size + ) + + # Synchronous copy to CPU (internal) + self.k_cache_cpu[layer_id, cpu_block_id, :actual_size].copy_(k[start:end]) + self.v_cache_cpu[layer_id, cpu_block_id, :actual_size].copy_(v[start:end]) +``` + +### Decode: Policy-Driven Block Loading + +```python +def load_layer_kv_to_buffer_with_policy( + self, + buffer_idx: int, + layer_id: int, + cpu_block_ids: List[int], + valid_tokens_per_block: List[int], + query: Optional[Tensor] = None, +) -> int: + """ + Load layer KV to buffer, optionally using sparse policy for block selection. + + Returns: + Total tokens loaded + """ + # Check if policy requires block selection + if (self.sparse_policy is not None and + self.sparse_policy.requires_block_selection and + query is not None): + # Build context + ctx = PolicyContext( + query_chunk_idx=0, + num_query_chunks=1, + layer_id=layer_id, + query=query, + is_prefill=False, + block_size=self.block_size, + ) + # Select blocks using policy + selected_blocks = self.sparse_policy.select_blocks(cpu_block_ids, ctx) + + # Build valid_tokens for selected blocks + block_to_valid = {bid: vt for bid, vt in zip(cpu_block_ids, valid_tokens_per_block)} + selected_valid = [block_to_valid[bid] for bid in selected_blocks] + + return self._load_blocks_to_buffer( + buffer_idx, layer_id, selected_blocks, selected_valid + ) + else: + # Load all blocks (no selection) + return self._load_blocks_to_buffer( + buffer_idx, layer_id, cpu_block_ids, valid_tokens_per_block + ) +``` + +## Prefill Integration (MInference) + +MInference only affects attention computation, not the load/offload flow: + +```python +# nanovllm/engine/model_runner.py - run_layerwise_offload_prefill() +def run_layerwise_offload_prefill(self, seqs): + ... + for layer_id in range(num_layers): + # QKV projection + RoPE + q, k = layer.self_attn.rotary_emb(positions, q, k) + + # Sparse or Full attention + if self.sparse_prefill_policy is not None: + # MInference: only changes attention computation + attn_output = self.sparse_prefill_policy.sparse_prefill_attention( + q, k, v, layer_id + ) + else: + # Full attention using FlashAttention + attn_output = flash_attn_varlen_func(q, k, v, ...) + + # MLP + ... + + # Offload ALL KV (MInference doesn't affect this) + offload_engine.offload_layer_kv_sync(layer_id, k, v, cpu_block_ids, total_tokens) +``` + +### Execution Flow Diagram + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Layerwise Offload Prefill │ +│ with MInference │ +└─────────────────────────────────────────────────────────────────┘ + +For each layer: +┌──────────────┐ ┌──────────────┐ ┌────────────────────────┐ +│ QKV Proj │───▶│ RoPE │───▶│ sparse_prefill_attn() │ +│ │ │ │ │ (MInference pattern) │ +└──────────────┘ └──────────────┘ └───────────┬────────────┘ + │ + ┌──────────────┐ ┌───────────▼────────────┐ + │ MLP │◀───│ O Projection │ + │ │ │ │ + └──────┬───────┘ └────────────────────────┘ + │ + ┌──────▼───────┐ + │ offload_ │ K, V still on GPU + │ layer_kv_ │───▶ Copy to CPU + │ sync() │ (all blocks) + └──────────────┘ +``` + +## Decode Integration (Quest - Infrastructure Ready) + +Quest affects block load strategy. The infrastructure is ready, full integration deferred. + +```python +# nanovllm/engine/model_runner.py - run_layerwise_offload_decode() +def run_layerwise_offload_decode(self, seqs): + ... + # Preload first N layers (no query available, full load) + for i in range(num_preload): + loaded_tokens[i] = offload_engine.load_layer_kv_to_buffer( + i, i, cpu_block_table, valid_tokens_per_block + ) + + for layer_id in range(num_layers): + current_buffer = layer_id % num_buffers + + # Wait for buffer load + offload_engine.wait_buffer_load(current_buffer) + + # QKV projection + q, k_new, v_new = ... + + # Get loaded KV from ring buffer + k_prefill, v_prefill = offload_engine.get_buffer_kv( + current_buffer, loaded_tokens[current_buffer] + ) + + # Attention + ... + + # Mark buffer done + offload_engine.record_buffer_compute_done(current_buffer) + + # Load next layer + # Future: use load_layer_kv_to_buffer_with_policy(query=q) for Quest + next_layer = layer_id + num_buffers + if next_layer < num_layers: + loaded_tokens[current_buffer] = offload_engine.load_layer_kv_to_buffer( + current_buffer, next_layer, cpu_block_table, valid_tokens_per_block + ) +``` + +### Quest Integration (Future Work) + +When Quest is fully integrated: + +```python +# Load next layer with Quest block selection +if next_layer < num_layers: + loaded_tokens[current_buffer] = offload_engine.load_layer_kv_to_buffer_with_policy( + current_buffer, next_layer, cpu_block_table, valid_tokens_per_block, + query=q # Pass query for block selection + ) +``` + +**Challenge**: First N layers are preloaded before query is available, so they must use full load. + +## Configuration + +### Enabling Sparse Policy + +```python +from nanovllm import LLM +from nanovllm.config import SparsePolicyType + +# GPU-only with MInference +llm = LLM( + model_path, + sparse_policy=SparsePolicyType.MINFERENCE, + minference_adaptive_budget=0.3, # 30% of seq_len +) + +# Offload with MInference +llm = LLM( + model_path, + enable_cpu_offload=True, + num_gpu_blocks=2, + sparse_policy=SparsePolicyType.MINFERENCE, + minference_adaptive_budget=0.3, +) +``` + +### MInference Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `minference_adaptive_budget` | 0.3 | Budget as fraction of seq_len (0.3 = 30%) | +| `minference_vertical_size` | 1000 | Fixed vertical size (when budget=None) | +| `minference_slash_size` | 6096 | Fixed slash size (when budget=None) | +| `minference_num_sink_tokens` | 30 | Always-kept initial tokens | +| `minference_num_recent_diags` | 100 | Always-kept recent diagonals | + +### Quest Parameters (for future decode integration) + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `sparse_topk_blocks` | 8 | Top-K blocks to load | +| `sparse_threshold_blocks` | 4 | Apply sparse only when blocks > threshold | + +## Sparse Policy Hooks + +Sparse policies can implement hooks for metadata collection: + +```python +class SparsePolicy(ABC): + def on_prefill_offload( + self, + block_id: int, + layer_id: int, + key: torch.Tensor, + valid_tokens: int, + ) -> None: + """ + Hook called during prefill offload BEFORE KV is copied to CPU. + Key tensor is still on GPU - can compute metadata efficiently. + + Used by Quest to compute min/max key statistics for block selection. + """ + pass + + def on_decode_offload( + self, + block_id: int, + keys: torch.Tensor, # [num_layers, block_size, kv_heads, head_dim] + ) -> None: + """ + Hook called when decode buffer is offloaded to CPU. + """ + pass +``` + +## File Changes Summary + +| File | Changes | +|------|---------| +| `nanovllm/kvcache/sparse/policy.py` | Add `requires_block_selection` attribute | +| `nanovllm/kvcache/sparse/minference.py` | Set `requires_block_selection = False` | +| `nanovllm/kvcache/sparse/quest.py` | Set `requires_block_selection = True` | +| `nanovllm/kvcache/sparse/full_policy.py` | Set `requires_block_selection = False` | +| `nanovllm/kvcache/offload_engine.py` | Add `offload_layer_kv_sync()`, sparse hooks | +| `nanovllm/engine/model_runner.py` | Integrate sparse policies in offload paths | + +## Key Design Principles + +1. **Encapsulation**: All `copy_()` operations inside OffloadEngine +2. **Interface Flag**: `requires_block_selection` declares policy type +3. **Separation of Concerns**: + - MInference: only `sparse_prefill_attention()` (compute-level) + - Quest: `select_blocks()` + hooks (load-level) +4. **Hooks Inside Engine**: Policy hooks called within OffloadEngine methods + +## Test Results + +Verified on Qwen3-4B-Instruct-2507 with 32K input: + +``` +# GPU-only + MInference +test_needle.py --model Qwen3-4B --input-len 32768 --enable-minference +- Prefill: 3383 tok/s +- Output: "7492<|im_end|>" +- Result: PASSED + +# Offload + MInference +test_needle.py --model Qwen3-4B --input-len 32768 --enable-offload --enable-minference +- Prefill: 5373 tok/s +- Output: "7492<|im_end|>" +- Result: PASSED +``` + +Both configurations produce identical outputs, confirming correctness. + +## Related Documents + +- [`sparse_attention_guide.md`](sparse_attention_guide.md): Algorithm details for sparse methods +- [`architecture_guide.md`](architecture_guide.md): Overall system architecture +- [`gpu_only_performance_issue.md`](gpu_only_performance_issue.md): Why offload is faster than GPU-only