From 690456dbf96f88e56e83120c9e2abafe2b3850d0 Mon Sep 17 00:00:00 2001
From: Zijie Tian <zijietian@mail.xmu.edu.cn>
Date: Tue, 20 Jan 2026 02:50:14 +0800
Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20refactor:=20create=20ops?=
 =?UTF-8?q?=20module=20and=20move=20chunked=5Fattention?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Create nanovllm/ops/ module for low-level attention operators
- Move chunked_attention.py from kvcache/ to ops/
- Update the 3 function-local imports in kvcache/sparse/full_policy.py
  to import from nanovllm.ops.chunked_attention
- Fix: remove stale code in OffloadEngine.reset() that called .zero_()
  on the non-existent layer_k/v_buffer_a/b attributes, and drop the
  matching line from the reset() docstring

Verified with needle test (32K offload): PASSED

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 nanovllm/kvcache/offload_engine.py            |  7 -------
 nanovllm/kvcache/sparse/full_policy.py        |  6 +++---
 nanovllm/ops/__init__.py                      | 19 +++++++++++++++++++
 .../{kvcache => ops}/chunked_attention.py     |  0
 4 files changed, 22 insertions(+), 10 deletions(-)
 create mode 100644 nanovllm/ops/__init__.py
 rename nanovllm/{kvcache => ops}/chunked_attention.py (100%)

diff --git a/nanovllm/kvcache/offload_engine.py b/nanovllm/kvcache/offload_engine.py
index 5e4423e..64142ba 100644
--- a/nanovllm/kvcache/offload_engine.py
+++ b/nanovllm/kvcache/offload_engine.py
@@ -255,7 +255,6 @@ class OffloadEngine:
         Clears:
         - GPU ring buffer slots (k_cache_gpu, v_cache_gpu)
         - Per-layer decode buffers (decode_k_buffer, decode_v_buffer)
-        - Cross-layer pipeline buffers (layer_k/v_buffer_a/b)
         - Per-layer prefill buffers (prefill_k/v_buffer)
         - All pending async transfer events
         """
@@ -267,12 +266,6 @@ class OffloadEngine:
         self.decode_k_buffer.zero_()
         self.decode_v_buffer.zero_()
 
-        # Clear cross-layer pipeline buffers
-        self.layer_k_buffer_a.zero_()
-        self.layer_v_buffer_a.zero_()
-        self.layer_k_buffer_b.zero_()
-        self.layer_v_buffer_b.zero_()
-
         # Clear per-layer prefill buffers
         self.prefill_k_buffer.zero_()
         self.prefill_v_buffer.zero_()
diff --git a/nanovllm/kvcache/sparse/full_policy.py b/nanovllm/kvcache/sparse/full_policy.py
index 514842f..ff99133 100644
--- a/nanovllm/kvcache/sparse/full_policy.py
+++ b/nanovllm/kvcache/sparse/full_policy.py
@@ -84,7 +84,7 @@ class FullAttentionPolicy(SparsePolicy):
         Returns:
             Attention output [seq_len, num_heads, head_dim]
         """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
+        from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs
 
         logger.debug(f"[DEBUG] FullPolicy.compute_chunked_prefill called, "
                      f"layer={layer_id}, chunk={current_chunk_idx}, num_tokens={num_tokens}")
@@ -222,7 +222,7 @@ class FullAttentionPolicy(SparsePolicy):
         Returns:
             Attention output [batch_size, 1, num_heads, head_dim]
         """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
+        from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs
 
         # q shape: [batch_size, num_heads, head_dim] (single decode token per sequence)
         q_batched = q.unsqueeze(1)  # [batch, 1, heads, dim]
@@ -319,7 +319,7 @@ class FullAttentionPolicy(SparsePolicy):
         Loads one block at a time, computes attention, and merges results.
         Uses load_to_slot_layer / wait_slot_layer / get_kv_for_slot methods.
         """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
+        from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs
 
         num_blocks = len(cpu_block_table)
         if num_blocks == 0:
diff --git a/nanovllm/ops/__init__.py b/nanovllm/ops/__init__.py
new file mode 100644
index 0000000..171cd29
--- /dev/null
+++ b/nanovllm/ops/__init__.py
@@ -0,0 +1,19 @@
+"""
+Operators module for nano-vLLM.
+
+This module contains low-level attention operators and kernels.
+"""
+
+from nanovllm.ops.chunked_attention import (
+    flash_attn_with_lse,
+    merge_attention_outputs,
+    chunked_attention_varlen,
+    ChunkedPrefillState,
+)
+
+__all__ = [
+    "flash_attn_with_lse",
+    "merge_attention_outputs",
+    "chunked_attention_varlen",
+    "ChunkedPrefillState",
+]
diff --git a/nanovllm/kvcache/chunked_attention.py b/nanovllm/ops/chunked_attention.py
similarity index 100%
rename from nanovllm/kvcache/chunked_attention.py
rename to nanovllm/ops/chunked_attention.py