From 690456dbf96f88e56e83120c9e2abafe2b3850d0 Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Tue, 20 Jan 2026 02:50:14 +0800 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20refactor:=20create=20ops?= =?UTF-8?q?=20module=20and=20move=20chunked=5Fattention?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create nanovllm/ops/ module for low-level attention operators - Move chunked_attention.py from kvcache/ to ops/ - Update imports in full_policy.py (3 locations) - Fix: remove dead code in OffloadEngine.reset() referencing non-existent layer_k/v_buffer_a/b attributes Verified with needle test (32K offload): PASSED Co-Authored-By: Claude Opus 4.5 --- nanovllm/kvcache/offload_engine.py | 7 ------- nanovllm/kvcache/sparse/full_policy.py | 6 +++--- nanovllm/ops/__init__.py | 19 +++++++++++++++++++ .../{kvcache => ops}/chunked_attention.py | 0 4 files changed, 22 insertions(+), 10 deletions(-) create mode 100644 nanovllm/ops/__init__.py rename nanovllm/{kvcache => ops}/chunked_attention.py (100%) diff --git a/nanovllm/kvcache/offload_engine.py b/nanovllm/kvcache/offload_engine.py index 5e4423e..64142ba 100644 --- a/nanovllm/kvcache/offload_engine.py +++ b/nanovllm/kvcache/offload_engine.py @@ -255,7 +255,6 @@ class OffloadEngine: Clears: - GPU ring buffer slots (k_cache_gpu, v_cache_gpu) - Per-layer decode buffers (decode_k_buffer, decode_v_buffer) - - Cross-layer pipeline buffers (layer_k/v_buffer_a/b) - Per-layer prefill buffers (prefill_k/v_buffer) - All pending async transfer events """ @@ -267,12 +266,6 @@ class OffloadEngine: self.decode_k_buffer.zero_() self.decode_v_buffer.zero_() - # Clear cross-layer pipeline buffers - self.layer_k_buffer_a.zero_() - self.layer_v_buffer_a.zero_() - self.layer_k_buffer_b.zero_() - self.layer_v_buffer_b.zero_() - # Clear per-layer prefill buffers self.prefill_k_buffer.zero_() self.prefill_v_buffer.zero_() diff --git a/nanovllm/kvcache/sparse/full_policy.py b/nanovllm/kvcache/sparse/full_policy.py index 514842f..ff99133 100644 --- a/nanovllm/kvcache/sparse/full_policy.py +++ b/nanovllm/kvcache/sparse/full_policy.py @@ -84,7 +84,7 @@ class FullAttentionPolicy(SparsePolicy): Returns: Attention output [seq_len, num_heads, head_dim] """ - from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs + from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs logger.debug(f"[DEBUG] FullPolicy.compute_chunked_prefill called, " f"layer={layer_id}, chunk={current_chunk_idx}, num_tokens={num_tokens}") @@ -222,7 +222,7 @@ class FullAttentionPolicy(SparsePolicy): Returns: Attention output [batch_size, 1, num_heads, head_dim] """ - from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs + from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs # q shape: [batch_size, num_heads, head_dim] (single decode token per sequence) q_batched = q.unsqueeze(1) # [batch, 1, heads, dim] @@ -319,7 +319,7 @@ class FullAttentionPolicy(SparsePolicy): Loads one block at a time, computes attention, and merges results. Uses load_to_slot_layer / wait_slot_layer / get_kv_for_slot methods. """ - from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs + from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs num_blocks = len(cpu_block_table) if num_blocks == 0: diff --git a/nanovllm/ops/__init__.py b/nanovllm/ops/__init__.py new file mode 100644 index 0000000..171cd29 --- /dev/null +++ b/nanovllm/ops/__init__.py @@ -0,0 +1,19 @@ +""" +Operators module for nano-vLLM. + +This module contains low-level attention operators and kernels. +""" + +from nanovllm.ops.chunked_attention import ( + flash_attn_with_lse, + merge_attention_outputs, + chunked_attention_varlen, + ChunkedPrefillState, +) + +__all__ = [ + "flash_attn_with_lse", + "merge_attention_outputs", + "chunked_attention_varlen", + "ChunkedPrefillState", +] diff --git a/nanovllm/kvcache/chunked_attention.py b/nanovllm/ops/chunked_attention.py similarity index 100% rename from nanovllm/kvcache/chunked_attention.py rename to nanovllm/ops/chunked_attention.py