[claudesquad] update from 'int-minference-1' on 08 Jan 26 23:22 CST

2026-01-08 23:22:38 +08:00
parent 0bfe1984ef
commit ea4e904de0
11 changed files with 853 additions and 533 deletions
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -531,16 +531,23 @@ class ModelRunner:
                # RoPE
                q, k = layer.self_attn.rotary_emb(positions, q, k)

-                # Full attention using FlashAttention
-                attn_output = flash_attn_varlen_func(
-                    q, k, v,
-                    cu_seqlens_q=cu_seqlens,
-                    cu_seqlens_k=cu_seqlens,
-                    max_seqlen_q=total_tokens,
-                    max_seqlen_k=total_tokens,
-                    softmax_scale=layer.self_attn.attn.scale,
-                    causal=True,
-                )
+                # Sparse or Full attention
+                if self.sparse_prefill_policy is not None:
+                    # MInference or other sparse prefill policy
+                    attn_output = self.sparse_prefill_policy.sparse_prefill_attention(
+                        q, k, v, layer_id
+                    )
+                else:
+                    # Full attention using FlashAttention
+                    attn_output = flash_attn_varlen_func(
+                        q, k, v,
+                        cu_seqlens_q=cu_seqlens,
+                        cu_seqlens_k=cu_seqlens,
+                        max_seqlen_q=total_tokens,
+                        max_seqlen_k=total_tokens,
+                        softmax_scale=layer.self_attn.attn.scale,
+                        causal=True,
+                    )

                # O projection
                attn_output = attn_output.view(total_tokens, -1)
@@ -550,16 +557,8 @@ class ModelRunner:
                hidden_states, residual = layer.post_attention_layernorm(hidden_states, residual)
                hidden_states = layer.mlp(hidden_states)

-                # 2d. Offload KV to CPU (synchronous to avoid race condition)
-                # NOTE: Async offload has race condition where k,v memory gets reused
-                # before D2H copy completes. Use sync copy for correctness.
-                block_size = offload_engine.block_size
-                for i, cpu_block_id in enumerate(cpu_block_ids):
-                    start = i * block_size
-                    end = min(start + block_size, total_tokens)
-                    actual_size = end - start
-                    offload_engine.k_cache_cpu[layer_id, cpu_block_id, :actual_size].copy_(k[start:end])
-                    offload_engine.v_cache_cpu[layer_id, cpu_block_id, :actual_size].copy_(v[start:end])
+                # 2d. Offload KV to CPU (encapsulated with sparse policy hooks)
+                offload_engine.offload_layer_kv_sync(layer_id, k, v, cpu_block_ids, total_tokens)

            # Step 3: Final norm
            hidden_states, _ = self.model.model.norm(hidden_states, residual)