[feat] Added chunked prefill and kvcache offload mechanism.
@@ -55,21 +55,164 @@ class Attention(nn.Module):
        self.scale = scale
        self.num_kv_heads = num_kv_heads
        self.k_cache = self.v_cache = torch.tensor([])
        # Layer ID set by model_runner after model creation
        self.layer_id: int = -1
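
    # The comment on layer_id implies model_runner tags each Attention module
    # after the model is built; a hypothetical sketch of that assignment (the
    # actual loop is not part of this hunk):
    #
    #     attn_layers = [m for m in model.modules() if isinstance(m, Attention)]
    #     for i, layer in enumerate(attn_layers):
    #         layer.layer_id = i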

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        context = get_context()
        k_cache, v_cache = self.k_cache, self.v_cache
        if k_cache.numel() and v_cache.numel():
            store_kvcache(k, v, k_cache, v_cache, context.slot_mapping)
        if context.is_prefill:
            if context.is_chunked_prefill:
                # Chunked prefill: merge attention from previous KV
                o = self._chunked_prefill_attention(q, k, v, context)
            else:
                if context.block_tables is not None:    # prefix cache
                    k, v = k_cache, v_cache
                o = flash_attn_varlen_func(q, k, v,
                                           max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
                                           max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k,
                                           softmax_scale=self.scale, causal=True, block_table=context.block_tables)
        else:    # decode
            if context.is_chunked_prefill:
                # Chunked decode: need to load all KV from CPU+GPU
                o = self._chunked_decode_attention(q, k, v, context)
            else:
                o = flash_attn_with_kvcache(q.unsqueeze(1), k_cache, v_cache,
                                            cache_seqlens=context.context_lens, block_table=context.block_tables,
                                            softmax_scale=self.scale, causal=True)
        return o
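
    # The branches above read several fields that are not part of the stock
    # nano-vllm Context; a sketch of the assumed additions (names inferred
    # from usage in this file, the actual definition lives elsewhere):
    #
    #     @dataclass
    #     class Context:
    #         ...
    #         is_chunked_prefill: bool = False   # chunked prefill/decode path
    #         offload_engine: object = None      # CPU<->GPU KV cache manager
    #         chunked_seq: object = None         # sequence whose KV is chunked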

    def _chunked_prefill_attention(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        context,
    ) -> torch.Tensor:
        """
        Compute attention with chunked KV from CPU cache.

        For chunked prefill:
        1. Load previous KV from CPU for this layer
        2. Compute attention against previous KV (no causal mask)
        3. Compute attention against current chunk's KV (causal)
        4. Merge results using online softmax
        """
        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs

        # q, k, v shape: [total_tokens, num_heads, head_dim]
        # Reshape for flash attention: [batch, seq, heads, dim]
        q_batched = q.unsqueeze(0)    # [1, total_tokens, heads, dim]
        k_batched = k.unsqueeze(0)
        v_batched = v.unsqueeze(0)

        accumulated_o = None
        accumulated_lse = None

        # Load previous KV from CPU for this layer
        if context.offload_engine is not None and self.layer_id >= 0:
            # Get the kvcache_manager from context
            kvcache_manager = context.offload_engine
            # For each sequence in the chunk, load previous KV
            # (currently assuming a single sequence)
            if hasattr(context, 'chunked_seq') and context.chunked_seq is not None:
                prev_k, prev_v = kvcache_manager.load_prev_kv_for_layer(
                    context.chunked_seq,
                    self.layer_id,
                )
                if prev_k is not None and prev_v is not None:
                    # Compute attention against previous KV (no causal mask)
                    prev_o, prev_lse = flash_attn_with_lse(
                        q_batched,
                        prev_k,
                        prev_v,
                        softmax_scale=self.scale,
                        causal=False,    # no causal mask for previous context
                    )
                    accumulated_o = prev_o
                    accumulated_lse = prev_lse

        # Compute attention against current chunk's KV (with causal mask)
        current_o, current_lse = flash_attn_with_lse(
            q_batched,
            k_batched,
            v_batched,
            softmax_scale=self.scale,
            causal=True,    # causal mask within the current chunk
        )

        # Merge with accumulated
        if accumulated_o is None:
            final_o = current_o
        else:
            final_o, _ = merge_attention_outputs(
                accumulated_o, accumulated_lse,
                current_o, current_lse,
            )

        # Remove batch dimension: [1, total_tokens, heads, dim] -> [total_tokens, heads, dim]
        return final_o.squeeze(0)
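
# Step 4 above relies on merge_attention_outputs from
# nanovllm.kvcache.chunked_attention. A minimal reference sketch of the
# assumed LSE-based merge (two partial softmax-attention results over
# disjoint key sets combined into one), with shapes matching this file:
# o: [batch, seqlen, heads, dim], lse: [batch, heads, seqlen].
import torch

def merge_attention_outputs_ref(o1, lse1, o2, lse2):
    # Combined log-sum-exp normalizer over both key sets.
    lse = torch.logaddexp(lse1, lse2)
    # Per-(batch, head, query) weights, reshaped to broadcast over [B, S, H, D].
    w1 = torch.exp(lse1 - lse).transpose(1, 2).unsqueeze(-1)
    w2 = torch.exp(lse2 - lse).transpose(1, 2).unsqueeze(-1)
    return w1 * o1 + w2 * o2, lse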

    def _chunked_decode_attention(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        context,
    ) -> torch.Tensor:
        """
        Compute decode attention with KV spread across CPU and GPU.

        For decode with chunked KV:
        1. Load all KV for this layer from CPU+GPU
        2. Compute attention (1 query token vs all KV)
        3. Return output
        """
        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse

        # q shape: [batch_size, num_heads, head_dim] (single decode token per
        # sequence); we need to attend to ALL previous tokens.
        if context.offload_engine is not None and self.layer_id >= 0:
            kvcache_manager = context.offload_engine
            if hasattr(context, 'chunked_seq') and context.chunked_seq is not None:
                # Load all KV from both GPU and CPU for this layer
                k_all, v_all = kvcache_manager.load_all_kv_for_layer(
                    context.chunked_seq,
                    self.layer_id,
                )
                if k_all is not None and v_all is not None:
                    # q: [batch, heads, dim] -> [batch, 1, heads, dim]
                    # (insert a seqlen dimension at position 1)
                    q_batched = q.unsqueeze(1)
                    # k_all, v_all shape: [1, total_kv_tokens, kv_heads, head_dim]
                    # No causal mask: the single decode token attends to all KV
                    out, _ = flash_attn_with_lse(
                        q_batched,
                        k_all,
                        v_all,
                        softmax_scale=self.scale,
                        causal=False,
                    )
                    # [batch, 1, heads, dim] -> [batch, heads, dim]
                    return out.squeeze(1)

        # Fallback: should not be reachable
        raise RuntimeError("Chunked decode attention failed: no KV available")
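
# Both helpers above import flash_attn_with_lse from
# nanovllm.kvcache.chunked_attention. A pure-PyTorch reference sketch of the
# assumed contract (an attention call that also returns the softmax
# log-sum-exp so partial results can be merged); the real implementation
# presumably wraps a flash-attn kernel rather than materializing scores:
import torch

def flash_attn_with_lse_ref(q, k, v, softmax_scale, causal):
    # q: [B, Sq, H, D]; k, v: [B, Sk, Hkv, D] with Hkv dividing H (GQA).
    B, Sq, H, D = q.shape
    Sk, Hkv = k.shape[1], k.shape[2]
    if Hkv != H:
        # Expand grouped KV heads to match the query heads.
        k = k.repeat_interleave(H // Hkv, dim=2)
        v = v.repeat_interleave(H // Hkv, dim=2)
    scores = torch.einsum("bqhd,bkhd->bhqk", q.float(), k.float()) * softmax_scale
    if causal:
        # Bottom-right-aligned causal mask, matching flash-attn semantics.
        mask = torch.ones(Sq, Sk, dtype=torch.bool, device=q.device).triu(Sk - Sq + 1)
        scores.masked_fill_(mask, float("-inf"))
    lse = torch.logsumexp(scores, dim=-1)    # [B, H, Sq]
    o = torch.einsum("bhqk,bkhd->bqhd", torch.softmax(scores, dim=-1), v.float())
    return o.to(q.dtype), lse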