♻️ refactor: create ops module and move chunked_attention
- Create nanovllm/ops/ module for low-level attention operators
- Move chunked_attention.py from kvcache/ to ops/
- Update imports in full_policy.py (3 locations)
- Fix: remove dead code in OffloadEngine.reset() referencing non-existent layer_k/v_buffer_a/b attributes

Verified with needle test (32K offload): PASSED

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -84,7 +84,7 @@ class FullAttentionPolicy(SparsePolicy):
         Returns:
             Attention output [seq_len, num_heads, head_dim]
         """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
+        from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs

         logger.debug(f"[DEBUG] FullPolicy.compute_chunked_prefill called, "
                      f"layer={layer_id}, chunk={current_chunk_idx}, num_tokens={num_tokens}")
@@ -222,7 +222,7 @@ class FullAttentionPolicy(SparsePolicy):
         Returns:
             Attention output [batch_size, 1, num_heads, head_dim]
         """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
+        from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs

         # q shape: [batch_size, num_heads, head_dim] (single decode token per sequence)
         q_batched = q.unsqueeze(1)  # [batch, 1, heads, dim]
@@ -319,7 +319,7 @@ class FullAttentionPolicy(SparsePolicy):
         Loads one block at a time, computes attention, and merges results.
         Uses load_to_slot_layer / wait_slot_layer / get_kv_for_slot methods.
         """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
+        from nanovllm.ops.chunked_attention import flash_attn_with_lse, merge_attention_outputs

         num_blocks = len(cpu_block_table)
         if num_blocks == 0:
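Note: every hunk above only redirects the import of flash_attn_with_lse and merge_attention_outputs from nanovllm.kvcache.chunked_attention to nanovllm.ops.chunked_attention; the surrounding attention logic is untouched. For context, the sketch below shows the log-sum-exp merging idea these helpers are built around: attention is computed per KV chunk together with the LSE of its scores, and the partial outputs are then combined into the exact full-sequence result. The function names and signatures here are illustrative assumptions for the sketch, not the actual API of the moved module.

import torch

def attn_with_lse(q, k, v):
    # Naive reference: attention over one KV chunk, also returning the
    # log-sum-exp (LSE) of the attention scores per query/head.
    # q: [seq, heads, dim], k/v: [kv_len, heads, dim]; no causal mask (sketch only).
    scores = torch.einsum("qhd,khd->hqk", q, k) * q.shape[-1] ** -0.5
    lse = torch.logsumexp(scores, dim=-1)                      # [heads, seq]
    out = torch.einsum("hqk,khd->qhd", scores.softmax(-1), v)  # [seq, heads, dim]
    return out, lse.transpose(0, 1)                            # lse as [seq, heads]

def merge_outputs(o_a, lse_a, o_b, lse_b):
    # Merge partial outputs from two disjoint KV chunks using their LSEs:
    # each chunk's output is weighted by its share of the combined softmax mass.
    lse = torch.logaddexp(lse_a, lse_b)
    w_a = torch.exp(lse_a - lse).unsqueeze(-1)
    w_b = torch.exp(lse_b - lse).unsqueeze(-1)
    return w_a * o_a + w_b * o_b, lse

# Merging chunked results reproduces attention over the full KV sequence.
q = torch.randn(4, 8, 64)
k, v = torch.randn(32, 8, 64), torch.randn(32, 8, 64)
o1, l1 = attn_with_lse(q, k[:16], v[:16])
o2, l2 = attn_with_lse(q, k[16:], v[16:])
merged, _ = merge_outputs(o1, l1, o2, l2)
full, _ = attn_with_lse(q, k, v)
assert torch.allclose(merged, full, atol=1e-5)

This is the same merge rule FlashAttention-style kernels apply when tiling over the KV dimension, which is why each per-block call in the decode path returns an LSE alongside its output.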