Merge branch 'tzj/minference' of ssh://git.zijie-tian.site:2222/zijie-tian/nano-vllm into tzj/minference
@@ -7,8 +7,9 @@ import torch

 class SparsePolicyType(Enum):
     """Sparse attention policy types."""
-    FULL = auto()   # No sparse attention (load all blocks)
-    QUEST = auto()  # Query-aware Top-K block selection (decode only)
+    FULL = auto()       # No sparse attention (load all blocks)
+    QUEST = auto()      # Query-aware Top-K block selection (decode only)
+    XATTN_BSA = auto()  # XAttention Block Sparse Attention (prefill only, chunked)


 @dataclass
@@ -37,12 +38,20 @@ class Config:
     num_cpu_kvcache_blocks: int = -1

     # Sparse attention configuration
-    # Quest: decode-only sparse attention with Top-K block selection
+    # FULL: no sparse attention (load all blocks)
+    # QUEST: decode-only sparse attention with Top-K block selection
+    # XATTN_BSA: prefill-only block sparse attention with chunk-level selection
     sparse_policy: SparsePolicyType = SparsePolicyType.FULL
     sparse_topk_blocks: int = 8       # Top-K blocks for Quest
     sparse_threshold_blocks: int = 4  # Apply sparse only when blocks > threshold

+    # XAttention BSA specific parameters
+    sparse_block_size: int = 128         # Block size for BSA (tokens per block)
+    sparse_samples_per_chunk: int = 128  # Samples per chunk for estimation
+    sparse_threshold: float = 0.9        # Cumulative attention threshold (0-1)
+    sparse_use_triton: bool = True       # Use Triton kernels for estimation
+    sparse_stride: int = 8               # Stride for Q/K downsampling
+
     def __post_init__(self):
         assert os.path.isdir(self.model)
         assert self.kvcache_block_size % 256 == 0
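For orientation, a minimal sketch of driving the new knobs from user code, assuming Config is importable from nanovllm.config alongside SparsePolicyType (the model path is a placeholder; field names and defaults are taken from the hunk above):

    from nanovllm.config import Config, SparsePolicyType

    config = Config(
        model="/path/to/model",                    # placeholder path
        sparse_policy=SparsePolicyType.XATTN_BSA,  # prefill-only BSA
        sparse_block_size=128,                     # tokens per BSA block
        sparse_threshold=0.9,                      # keep chunks until 90% of attention mass
    )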
@@ -142,8 +142,26 @@ class ModelRunner:
         block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * hf_config.torch_dtype.itemsize

-        # Calculate max GPU blocks based on available memory
-        max_gpu_blocks = int(total * config.gpu_memory_utilization - used - peak + current) // block_bytes
-        assert max_gpu_blocks > 0
+        # In CPU offload mode with shared GPU, use actual free memory instead of total * utilization
+        if config.enable_cpu_offload and used > total * 0.5:
+            # GPU is shared with other processes, use actual free memory
+            available_memory = free * 0.9  # Leave 10% buffer
+        else:
+            # Standard calculation for dedicated GPU usage
+            available_memory = total * config.gpu_memory_utilization - used - peak + current
+
+        max_gpu_blocks = int(available_memory) // block_bytes
+
+        if max_gpu_blocks <= 0:
+            raise RuntimeError(
+                f"Insufficient GPU memory for KV cache allocation. "
+                f"Total: {total/1024**3:.2f} GB, "
+                f"Used by other processes: {used/1024**3:.2f} GB, "
+                f"Free: {free/1024**3:.2f} GB, "
+                f"Available: {available_memory/1024**3:.2f} GB, "
+                f"Required per block: {block_bytes/1024**2:.2f} MB. "
+                f"Try waiting for GPU to be available or reduce model size."
+            )

         # Determine final GPU blocks: user-specified or auto (max available)
         if config.num_gpu_blocks > 0:
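A quick sanity check on the sizing arithmetic (illustrative numbers, not from this commit): with 32 layers, 8 KV heads, head_dim 128, fp16 (itemsize 2), and a 256-token block, the leading factor of 2 accounts for storing both keys and values:

    block_bytes = 2 * 32 * 256 * 8 * 128 * 2        # = 33,554,432 bytes = 32 MiB per block
    available_memory = 10 * 1024**3                  # say 10 GiB usable after the checks above
    max_gpu_blocks = available_memory // block_bytes # = 320 blocks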
@@ -64,11 +64,24 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
     # Create sparse policy from config enum
-    # Quest is decode-only: prefill returns all blocks (query=None), decode does Top-K
     sparse_policy_type = getattr(config, 'sparse_policy', SparsePolicyType.FULL)
-    sparse_policy = create_sparse_policy(
-        sparse_policy_type,
-        topk_blocks=getattr(config, 'sparse_topk_blocks', 8),
-        threshold_blocks=getattr(config, 'sparse_threshold_blocks', 4),
-    )
+
+    # Build policy kwargs based on policy type
+    policy_kwargs = {}
+    if sparse_policy_type == SparsePolicyType.QUEST:
+        policy_kwargs = {
+            'topk_blocks': getattr(config, 'sparse_topk_blocks', 8),
+            'threshold_blocks': getattr(config, 'sparse_threshold_blocks', 4),
+        }
+    elif sparse_policy_type == SparsePolicyType.XATTN_BSA:
+        policy_kwargs = {
+            'block_size': getattr(config, 'sparse_block_size', 128),
+            'samples_per_chunk': getattr(config, 'sparse_samples_per_chunk', 128),
+            'threshold': getattr(config, 'sparse_threshold', 0.9),
+            'use_triton': getattr(config, 'sparse_use_triton', True),
+            'stride': getattr(config, 'sparse_stride', 8),
+        }
+
+    sparse_policy = create_sparse_policy(sparse_policy_type, **policy_kwargs)

     return HybridKVCacheManager(
         num_gpu_slots=num_gpu_blocks,
@@ -905,3 +905,60 @@ class OffloadEngine:
     def wait_prefill_offload(self, layer_id: int) -> None:
         """Wait for a specific layer's prefill offload to complete."""
         self.prefill_offload_events[layer_id].synchronize()
+
+    # ========== XAttention BSA Helper Methods ==========
+
+    def load_block_sample_from_cpu(
+        self,
+        cpu_block_id: int,
+        layer_id: int,
+        num_samples: int,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Load sample tokens from a CPU block for XAttention BSA estimation.
+
+        This is used in the estimate phase of XAttention BSA to load a small
+        sample of tokens from each historical chunk for importance estimation.
+
+        Args:
+            cpu_block_id: Source CPU block ID
+            layer_id: Layer index
+            num_samples: Number of tokens to sample
+
+        Returns:
+            (k_sample, v_sample) tensors, shape: [num_samples, kv_heads, head_dim]
+        """
+        # Sample from the beginning of the block
+        k_sample = self.k_cache_cpu[
+            layer_id, cpu_block_id, :num_samples
+        ].clone().cuda()
+        v_sample = self.v_cache_cpu[
+            layer_id, cpu_block_id, :num_samples
+        ].clone().cuda()
+        return k_sample, v_sample
+
+    def load_block_full_from_cpu(
+        self,
+        cpu_block_id: int,
+        layer_id: int,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Load full tokens from a CPU block for XAttention BSA computation.
+
+        This is used in the compute phase of XAttention BSA to load the full
+        data for selected important chunks.
+
+        Args:
+            cpu_block_id: Source CPU block ID
+            layer_id: Layer index
+
+        Returns:
+            (k_full, v_full) tensors, shape: [block_size, kv_heads, head_dim]
+        """
+        k_full = self.k_cache_cpu[
+            layer_id, cpu_block_id
+        ].clone().cuda()
+        v_full = self.v_cache_cpu[
+            layer_id, cpu_block_id
+        ].clone().cuda()
+        return k_full, v_full
@@ -23,6 +23,7 @@ from nanovllm.config import SparsePolicyType
 from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
 from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
 from nanovllm.kvcache.sparse.quest import QuestPolicy, QuestConfig, BlockMetadataManager
+from nanovllm.kvcache.sparse.xattn_bsa import XAttentionBSAPolicy


 def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
@@ -55,6 +56,13 @@ def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
         )
         return QuestPolicy(config)

+    elif policy_type == SparsePolicyType.XATTN_BSA:
+        return XAttentionBSAPolicy(
+            block_size=kwargs.get("block_size", 128),
+            samples_per_chunk=kwargs.get("samples_per_chunk", 128),
+            threshold=kwargs.get("threshold", 0.9),
+        )
+
     else:
         raise ValueError(f"Unknown policy type: {policy_type}")
@@ -67,5 +75,6 @@ __all__ = [
     "QuestPolicy",
    "QuestConfig",
     "BlockMetadataManager",
+    "XAttentionBSAPolicy",
     "create_sparse_policy",
 ]
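A minimal usage sketch of the factory, mirroring the defaults above (assuming the package exposes create_sparse_policy as listed in __all__):

    from nanovllm.config import SparsePolicyType
    from nanovllm.kvcache.sparse import create_sparse_policy

    policy = create_sparse_policy(
        SparsePolicyType.XATTN_BSA,
        block_size=128,
        samples_per_chunk=128,
        threshold=0.9,
    )

Note that create_kvcache_manager also puts use_triton and stride into policy_kwargs, but the XATTN_BSA branch of the factory only forwards block_size, samples_per_chunk, and threshold, so those two settings are silently dropped as written.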
@@ -5,8 +5,19 @@ This serves as a baseline and default policy when sparse
 attention is not needed.
 """

-from typing import List
+import logging
+import torch
+from typing import List, Optional, TYPE_CHECKING

 from .policy import SparsePolicy, PolicyContext
+from nanovllm.utils.context import get_context
+
+if TYPE_CHECKING:
+    from nanovllm.kvcache.offload_engine import OffloadEngine
+    from nanovllm.kvcache.manager import KVCacheManager
+    from nanovllm.engine.sequence import Sequence
+
+logger = logging.getLogger(__name__)


 class FullAttentionPolicy(SparsePolicy):
@@ -29,10 +40,157 @@ class FullAttentionPolicy(SparsePolicy):
     def select_blocks(
         self,
         available_blocks: List[int],
+        offload_engine: "OffloadEngine",
         ctx: PolicyContext,
     ) -> List[int]:
         """Return all blocks - no sparsity."""
         return available_blocks

+    def compute_chunked_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer_id: int,
+        softmax_scale: float,
+        offload_engine: "OffloadEngine",
+        kvcache_manager: "KVCacheManager",
+        current_chunk_idx: int,
+        seq: "Sequence",
+        num_tokens: int,
+    ) -> torch.Tensor:
+        """
+        Compute full attention for chunked prefill.
+
+        This method handles the complete chunked prefill flow:
+        1. Get historical blocks
+        2. Select blocks via select_blocks
+        3. Load and compute attention to historical chunks
+        4. Compute attention to current chunk
+        5. Merge all results
+
+        Args:
+            q: Query tensor [seq_len, num_heads, head_dim]
+            k: Key tensor [seq_len, num_kv_heads, head_dim] (unused, from prefill buffer)
+            v: Value tensor [seq_len, num_kv_heads, head_dim] (unused, from prefill buffer)
+            layer_id: Current layer index
+            softmax_scale: Softmax scaling factor
+            offload_engine: OffloadEngine for loading blocks
+            kvcache_manager: KVCacheManager for block management
+            current_chunk_idx: Current chunk index
+            seq: Sequence object
+            num_tokens: Number of tokens in current chunk
+
+        Returns:
+            Attention output [seq_len, num_heads, head_dim]
+        """
+        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
+
+        logger.debug(f"[DEBUG] FullPolicy.compute_chunked_attention called, "
+                     f"layer={layer_id}, chunk={current_chunk_idx}, num_tokens={num_tokens}")
+
+        q_batched = q.unsqueeze(0)  # [1, seq_len, num_heads, head_dim]
+        o_acc = None
+        lse_acc = None
+        compute_stream = offload_engine.compute_stream
+
+        # Step 1: Get historical blocks
+        cpu_block_table = kvcache_manager.get_prefilled_cpu_blocks(seq)
+
+        # Step 2: Apply select_blocks to filter blocks
+        if cpu_block_table:
+            num_chunks = current_chunk_idx + 1
+            policy_ctx = PolicyContext(
+                query_chunk_idx=current_chunk_idx,
+                num_query_chunks=num_chunks,
+                layer_id=layer_id,
+                query=None,  # Prefill typically doesn't use query for selection
+                is_prefill=True,
+                block_size=kvcache_manager.block_size,
+                total_kv_len=len(cpu_block_table) * kvcache_manager.block_size,
+            )
+            cpu_block_table = self.select_blocks(cpu_block_table, offload_engine, policy_ctx)
+            logger.debug(f"[DEBUG] select_blocks: output={len(cpu_block_table)} blocks")
+
+        if cpu_block_table:
+            load_slots = list(range(offload_engine.num_ring_slots))
+            num_blocks = len(cpu_block_table)
+
+            if len(load_slots) == 1:
+                # Only 1 slot - use synchronous mode
+                slot = load_slots[0]
+                for block_idx in range(num_blocks):
+                    cpu_block_id = cpu_block_table[block_idx]
+                    offload_engine.load_to_slot_layer(slot, layer_id, cpu_block_id)
+                    offload_engine.wait_slot_layer(slot)
+
+                    with torch.cuda.stream(compute_stream):
+                        prev_k, prev_v = offload_engine.get_kv_for_slot(slot)
+                        prev_o, prev_lse = flash_attn_with_lse(
+                            q_batched, prev_k, prev_v,
+                            softmax_scale=softmax_scale,
+                            causal=False,
+                        )
+                        if o_acc is None:
+                            o_acc, lse_acc = prev_o, prev_lse
+                        else:
+                            o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
+                        offload_engine.record_slot_compute_done(slot)
+            else:
+                # Multiple slots - use pipeline
+                num_slots = len(load_slots)
+                num_preload = min(num_slots, num_blocks)
+                for i in range(num_preload):
+                    offload_engine.load_to_slot_layer(load_slots[i], layer_id, cpu_block_table[i])
+
+                for block_idx in range(num_blocks):
+                    current_slot = load_slots[block_idx % num_slots]
+                    cpu_block_id = cpu_block_table[block_idx]
+
+                    offload_engine.wait_slot_layer(current_slot)
+
+                    with torch.cuda.stream(compute_stream):
+                        prev_k, prev_v = offload_engine.get_kv_for_slot(current_slot)
+                        prev_o, prev_lse = flash_attn_with_lse(
+                            q_batched, prev_k, prev_v,
+                            softmax_scale=softmax_scale,
+                            causal=False,
+                        )
+                        offload_engine.record_slot_compute_done(current_slot)
+
+                        if o_acc is None:
+                            o_acc, lse_acc = prev_o, prev_lse
+                        else:
+                            o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
+
+                    # Issue next transfer
+                    next_block_idx = block_idx + num_slots
+                    if next_block_idx < num_blocks:
+                        next_slot = load_slots[next_block_idx % num_slots]
+                        next_cpu_block_id = cpu_block_table[next_block_idx]
+                        offload_engine.load_to_slot_layer(next_slot, layer_id, next_cpu_block_id)
+
+        # Step 4: Compute attention to current chunk (causal mask)
+        with torch.cuda.stream(compute_stream):
+            k_curr, v_curr = offload_engine.get_prefill_buffer_slice(layer_id, num_tokens)
+            current_o, current_lse = flash_attn_with_lse(
+                q_batched, k_curr, v_curr,
+                softmax_scale=softmax_scale,
+                causal=True,
+            )
+
+        # Step 5: Merge historical and current attention
+        with torch.cuda.stream(compute_stream):
+            if o_acc is None:
+                final_o = current_o
+            else:
+                final_o, _ = merge_attention_outputs(o_acc, lse_acc, current_o, current_lse)
+
+        # Sync default stream with compute_stream before returning
+        torch.cuda.default_stream().wait_stream(compute_stream)
+
+        # Remove batch dimension: [1, seq_len, num_heads, head_dim] -> [seq_len, num_heads, head_dim]
+        return final_o.squeeze(0)
+
     def __repr__(self) -> str:
         return "FullAttentionPolicy()"
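For reference, the merge steps above implement the standard log-sum-exp combination of partial attention results over disjoint KV sets. A minimal sketch of the semantics, assuming lse is the per-query log-sum-exp returned by flash_attn_with_lse and is laid out as [batch, seq, heads] so it broadcasts against o of shape [batch, seq, heads, dim] (the real layout may differ):

    import torch

    def merge_attention_outputs_ref(o1, lse1, o2, lse2):
        # Combined normalizer over both KV sets.
        lse = torch.logaddexp(lse1, lse2)
        # Each partial output's share of the total softmax mass.
        w1 = (lse1 - lse).exp().unsqueeze(-1)
        w2 = (lse2 - lse).exp().unsqueeze(-1)
        return w1 * o1 + w2 * o2, lse

Because this combination is associative and commutative, partial outputs over historical chunks (causal=False) and the current chunk (causal=True) can be accumulated pairwise in any order.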
@@ -7,12 +7,17 @@ from CPU for each query chunk during chunked attention computation.
 """

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import List, Optional, Any
+from typing import List, Optional, Any, TYPE_CHECKING
 import torch

 # Import SparsePolicyType from config to avoid circular imports
 from nanovllm.config import SparsePolicyType

+if TYPE_CHECKING:
+    from nanovllm.kvcache.offload_engine import OffloadEngine
+    from nanovllm.kvcache.manager import KVCacheManager
+    from nanovllm.engine.sequence import Sequence
+

 @dataclass
 class PolicyContext:
@@ -35,8 +40,8 @@ class PolicyContext:
     query: Optional[torch.Tensor]
     """
     Query tensor for current chunk.
-    Shape: [1, num_heads, head_dim] for decode, [1, seq_len, num_heads, head_dim] for prefill.
-    May be None if not available (e.g., some prefill scenarios).
+    Shape: [1, num_heads, head_dim] for decode, [seq_len, num_heads, head_dim] for prefill.
+    Available for both prefill and decode phases.
     """

     is_prefill: bool
@@ -107,6 +112,7 @@ class SparsePolicy(ABC):
     def select_blocks(
         self,
         available_blocks: List[int],
+        offload_engine: "OffloadEngine",
         ctx: PolicyContext,
     ) -> List[int]:
         """
@@ -120,6 +126,8 @@ class SparsePolicy(ABC):
            available_blocks: List of CPU block IDs that contain KV cache
                              from previous chunks. These are ordered by
                              their position in the sequence.
+           offload_engine: OffloadEngine for loading KV (some policies need
+                           to load KV to make selection decisions).
           ctx: PolicyContext with information about the current query
                chunk, layer, phase (prefill/decode), etc.

@@ -183,5 +191,47 @@ class SparsePolicy(ABC):
         """
         pass

+    @abstractmethod
+    def compute_chunked_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer_id: int,
+        softmax_scale: float,
+        offload_engine: "OffloadEngine",
+        kvcache_manager: "KVCacheManager",
+        current_chunk_idx: int,
+        seq: "Sequence",
+        num_tokens: int,
+    ) -> torch.Tensor:
+        """
+        Compute chunked prefill attention (complete flow).
+
+        This is the main entry point for prefill attention computation.
+        It defines the complete prefill flow:
+        1. Get historical blocks
+        2. Select blocks (call select_blocks)
+        3. Load and compute historical blocks via offload_engine
+        4. Get current chunk KV from offload_engine, compute attention
+        5. Merge all results
+
+        Args:
+            q: [seq_len, num_heads, head_dim] query for current chunk
+            k: [seq_len, num_kv_heads, head_dim] key for current chunk (in prefill buffer)
+            v: [seq_len, num_kv_heads, head_dim] value for current chunk (in prefill buffer)
+            layer_id: transformer layer index
+            softmax_scale: softmax scaling factor
+            offload_engine: OffloadEngine for loading blocks
+            kvcache_manager: KVCacheManager for block management
+            current_chunk_idx: current chunk index
+            seq: Sequence object
+            num_tokens: number of tokens in current chunk
+
+        Returns:
+            [seq_len, num_heads, head_dim] final attention output
+        """
+        pass
+
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}()"
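Because FullAttentionPolicy routes its block choice through self.select_blocks (see the prefill flow above), a new policy can reuse the whole load/compute/merge pipeline by overriding only that hook. A hypothetical sliding-window variant, for illustration only:

    from typing import List

    from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
    from nanovllm.kvcache.sparse.policy import PolicyContext

    class LastKBlocksPolicy(FullAttentionPolicy):
        """Keep only the most recent k historical blocks (hypothetical)."""

        def __init__(self, k: int = 8):
            self.k = k

        def select_blocks(
            self,
            available_blocks: List[int],
            offload_engine: "OffloadEngine",
            ctx: PolicyContext,
        ) -> List[int]:
            # Inherited compute_chunked_attention will load and attend
            # over only these blocks.
            return available_blocks[-self.k:]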
nanovllm/kvcache/sparse/xattn_bsa.py (new file, 70 lines)
@@ -0,0 +1,70 @@
+"""
+XAttention Block Sparse Attention (BSA) Policy for nano-vllm.
+
+This module implements XAttention-inspired block sparse attention for chunked prefill.
+Current implementation loads all historical blocks (FULL strategy).
+Sparse selection to be implemented in next phase.
+"""
+
+import torch
+from typing import List, Optional, Tuple
+
+from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
+from nanovllm.utils.context import get_context
+
+
+class XAttentionBSAPolicy(SparsePolicy):
+    """
+    XAttention Block Sparse Attention policy for chunked prefill.
+
+    This policy uses block-level estimation to determine which KV blocks
+    are important for the current chunk's queries, enabling sparse computation.
+
+    Note: Current implementation loads all historical chunks (FULL strategy).
+    Sparse selection to be implemented in next phase.
+    """
+
+    supports_prefill = False  # Uses standard select_blocks interface
+    supports_decode = False   # BSA is prefill-only
+    requires_block_selection = False  # Selection happens at chunk level, not block level
+
+    def __init__(
+        self,
+        block_size: int = 128,
+        samples_per_chunk: int = 128,
+        threshold: float = 0.9,
+    ):
+        """
+        Initialize XAttention BSA policy.
+
+        Args:
+            block_size: Number of tokens per block (default: 128)
+            samples_per_chunk: Number of tokens to sample from each historical chunk for estimation
+            threshold: Cumulative attention threshold for chunk selection (0-1)
+        """
+        self.block_size = block_size
+        self.samples_per_chunk = samples_per_chunk
+        self.threshold = threshold
+
+    def select_blocks(self, available_blocks: List[int], ctx: PolicyContext) -> List[int]:
+        """
+        Select blocks to load from CPU.
+
+        Current implementation returns all blocks (FULL strategy).
+        Sparse selection to be implemented in next phase.
+
+        Args:
+            available_blocks: List of all available CPU block IDs
+            ctx: Policy context with query info, chunk index, etc.
+
+        Returns:
+            List of selected block IDs to load
+        """
+        # Current: Return all blocks (FULL strategy)
+        # TODO: Implement sparse selection based on query attention estimation
+        return available_blocks
+
+    def reset(self) -> None:
+        """Reset policy state."""
+        pass
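Two observations on this new file. First, select_blocks here omits the offload_engine parameter that this same commit adds to the SparsePolicy base class and to FullAttentionPolicy, so the signatures will need to be reconciled when the sparse selection lands. Second, for the TODO above, one plausible shape for the estimate phase, built on the load_block_sample_from_cpu helper added to OffloadEngine and the cumulative-threshold semantics of sparse_threshold (a sketch under those assumptions, not the committed algorithm; it assumes num_heads == num_kv_heads for brevity):

    import torch

    def select_chunks_by_threshold(q, offload_engine, cpu_blocks, layer_id,
                                   softmax_scale, num_samples=128, threshold=0.9):
        # Score each historical chunk by the log-sum-exp of its sampled QK logits.
        chunk_lse = []
        for block_id in cpu_blocks:
            k_s, _ = offload_engine.load_block_sample_from_cpu(block_id, layer_id, num_samples)
            logits = softmax_scale * torch.einsum('qhd,khd->qhk', q, k_s.to(q.dtype))
            chunk_lse.append(torch.logsumexp(logits.flatten(), dim=0))
        # Each chunk's approximate share of total attention mass.
        weights = torch.softmax(torch.stack(chunk_lse), dim=0)
        order = weights.argsort(descending=True)
        cum = weights[order].cumsum(0)
        # Smallest prefix of the ranking whose cumulative share reaches threshold.
        keep = order[: int((cum < threshold).sum().item()) + 1]
        return [cpu_blocks[i] for i in sorted(keep.tolist())]

Selected chunks would then be loaded in full via load_block_full_from_cpu for the compute phase.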
@@ -174,116 +174,45 @@ class Attention(nn.Module):
         """
         Compute attention with per-layer prefill buffer for async offload.

-        Optimized design:
-        - Current chunk's KV is written to per-layer prefill buffer (not GPU slot)
-        - Previous chunks' KV are loaded from CPU using GPU slots
-        - Each layer offloads from its own buffer - no waiting required!
+        Simplified design:
+        - All computation logic is delegated to sparse_policy.compute_chunked_attention()
+        - This method only handles async offload after computation

-        For each layer:
-        1. Current chunk's KV is in prefill_buffer[layer_id] (just written by model)
-        2. Load previous chunks from CPU using available slots (pipeline)
-        3. Compute attention against previous KV (no causal mask)
-        4. Compute attention against current KV from prefill buffer (causal)
-        5. Merge all results using online softmax
-        6. Async offload prefill buffer to CPU (no waiting!)
+        The policy handles:
+        1. Loading historical blocks from CPU
+        2. Computing attention against historical KV (no causal mask)
+        3. Computing attention against current KV from prefill buffer (causal)
+        4. Merging all results
         """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
-
         current_chunk_idx = context.current_chunk_idx
         torch.cuda.nvtx.range_push(f"ChunkedPrefill: L{self.layer_id} Chunk{current_chunk_idx}")

-        # q shape: [total_tokens, num_heads, head_dim]
-        q_batched = q.unsqueeze(0)  # [1, total_tokens, heads, dim]
         num_tokens = k.shape[0]

-        o_acc = None
-        lse_acc = None
-
         kvcache_manager = context.kvcache_manager
         seq = context.chunked_seq if hasattr(context, 'chunked_seq') else None
         offload_engine = kvcache_manager.offload_engine if kvcache_manager is not None else None

         if kvcache_manager is not None and seq is not None and self.layer_id >= 0:
-            # Get prefilled CPU blocks (blocks from previous chunks)
-            cpu_block_table = kvcache_manager.get_prefilled_cpu_blocks(seq)
+            # Get sparse policy - required for chunked prefill
+            sparse_policy = kvcache_manager.sparse_policy
+            if sparse_policy is None:
+                raise RuntimeError("sparse_policy is required for chunked prefill")

-            # Apply sparse policy if enabled (Quest returns all blocks for prefill since query=None)
-            sparse_policy = kvcache_manager.sparse_policy
-            if cpu_block_table and sparse_policy is not None:
-                num_chunks = getattr(context, 'num_chunks', current_chunk_idx + 1)
-                policy_ctx = PolicyContext(
-                    query_chunk_idx=current_chunk_idx,
-                    num_query_chunks=num_chunks,
-                    layer_id=self.layer_id,
-                    query=None,  # Prefill typically doesn't use query for selection
-                    is_prefill=True,
-                    block_size=kvcache_manager.block_size,
-                    total_kv_len=len(cpu_block_table) * kvcache_manager.block_size,
-                )
-                cpu_block_table = sparse_policy.select_blocks(
-                    cpu_block_table, policy_ctx
-                )
+            # [DEBUG] Verify execution path
+            logger.debug(f"[DEBUG] Calling sparse_policy.compute_chunked_attention, "
+                         f"policy={sparse_policy}, layer={self.layer_id}, chunk={current_chunk_idx}")

-            if cpu_block_table:
-                # Get available load slots (all slots can be used since we use prefill buffer)
-                load_slots = list(range(offload_engine.num_ring_slots))
-                pipeline_depth = len(load_slots)
-
-                if pipeline_depth == 0:
-                    # Only 1 slot total, cannot pipeline - use sync loading
-                    o_acc, lse_acc = self._sync_load_previous_chunks(
-                        q_batched, cpu_block_table, offload_engine
-                    )
-                else:
-                    # Use ring buffer pipeline
-                    o_acc, lse_acc = self._ring_buffer_pipeline_load(
-                        q_batched, cpu_block_table, load_slots, offload_engine,
-                        current_chunk_idx
-                    )
-
-        # Get compute stream for all attention operations
-        compute_stream = offload_engine.compute_stream if offload_engine is not None else None
-
-        # Compute attention against current chunk's KV from prefill buffer (with causal mask)
-        if compute_stream is not None:
-            with torch.cuda.stream(compute_stream):
-                torch.cuda.nvtx.range_push(f"FlashAttn: L{self.layer_id} CurrentChunk (causal)")
-                # Get KV from per-layer prefill buffer
-                k_batched, v_batched = offload_engine.get_prefill_buffer_slice(self.layer_id, num_tokens)
-                current_o, current_lse = flash_attn_with_lse(
-                    q_batched,
-                    k_batched,
-                    v_batched,
-                    softmax_scale=self.scale,
-                    causal=True,
-                )
-                torch.cuda.nvtx.range_pop()
-        else:
-            torch.cuda.nvtx.range_push(f"FlashAttn: L{self.layer_id} CurrentChunk (causal)")
-            k_batched = k.unsqueeze(0)
-            v_batched = v.unsqueeze(0)
-            current_o, current_lse = flash_attn_with_lse(
-                q_batched,
-                k_batched,
-                v_batched,
-                softmax_scale=self.scale,
-                causal=True,
-            )
-            torch.cuda.nvtx.range_pop()
-
-        # Merge with accumulated (all on compute_stream for consistency)
-        if o_acc is None:
-            final_o = current_o
-        else:
-            if compute_stream is not None:
-                with torch.cuda.stream(compute_stream):
-                    torch.cuda.nvtx.range_push(f"MergeAttn: L{self.layer_id}")
-                    final_o, _ = merge_attention_outputs(o_acc, lse_acc, current_o, current_lse)
-                    torch.cuda.nvtx.range_pop()
-            else:
-                torch.cuda.nvtx.range_push(f"MergeAttn: L{self.layer_id}")
-                final_o, _ = merge_attention_outputs(o_acc, lse_acc, current_o, current_lse)
-                torch.cuda.nvtx.range_pop()
+            # Delegate all computation to policy (no flash_attn or merge calls here!)
+            final_o = sparse_policy.compute_chunked_attention(
+                q, k, v,
+                self.layer_id,
+                self.scale,
+                offload_engine,
+                kvcache_manager,
+                current_chunk_idx,
+                seq,
+                num_tokens,
+            )

         torch.cuda.nvtx.range_pop()  # ChunkedPrefill
@@ -298,181 +227,7 @@ class Attention(nn.Module):
                 self.layer_id, cpu_block_id, num_tokens
             )

-        # Sync default stream with compute_stream before returning
-        # This ensures the result is ready for the rest of the model (layernorm, MLP)
-        if compute_stream is not None:
-            torch.cuda.default_stream().wait_stream(compute_stream)
-
-        # Remove batch dimension: [1, total_tokens, heads, dim] -> [total_tokens, heads, dim]
-        return final_o.squeeze(0)
-
-    def _sync_load_previous_chunks(
-        self,
-        q_batched: torch.Tensor,
-        cpu_block_table: list,
-        offload_engine,
-    ):
-        """Synchronous loading fallback when pipeline_depth=0."""
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
-
-        o_acc, lse_acc = None, None
-        compute_stream = offload_engine.compute_stream
-
-        for block_idx, cpu_block_id in enumerate(cpu_block_table):
-            # Load to slot 0 (single slot)
-            offload_engine.load_to_slot_layer(0, self.layer_id, cpu_block_id)
-            offload_engine.wait_slot_layer(0)
-
-            # IMPORTANT: Must use compute_stream to match wait_slot_layer
-            with torch.cuda.stream(compute_stream):
-                prev_k, prev_v = offload_engine.get_kv_for_slot(0)
-
-                prev_o, prev_lse = flash_attn_with_lse(
-                    q_batched, prev_k, prev_v,
-                    softmax_scale=self.scale,
-                    causal=False,
-                )
-
-            if o_acc is None:
-                o_acc, lse_acc = prev_o, prev_lse
-            else:
-                o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
-
-        return o_acc, lse_acc
-
-    def _ring_buffer_pipeline_load(
-        self,
-        q_batched: torch.Tensor,
-        cpu_block_table: list,
-        load_slots: list,
-        offload_engine,
-        current_chunk_idx: int = -1,
-    ):
-        """
-        Ring buffer async pipeline loading with double buffering.
-
-        Uses compute_done events to ensure safe buffer reuse:
-        - Before loading to slot X, wait for previous compute on slot X to finish
-        - Before computing on slot X, wait for load to slot X to finish
-
-        Timeline with 2 slots (A, B):
-        ┌──────────────┐
-        │  Load B0→A   │
-        └──────────────┘
-                ┌──────────────┐    ┌──────────────┐
-                │  Load B1→B   │    │  Load B2→A   │  ...
-                └──────────────┘    └──────────────┘
-                        ↘                   ↘
-                ┌──────────────┐    ┌──────────────┐
-                │  Compute(A)  │    │  Compute(B)  │  ...
-                └──────────────┘    └──────────────┘
-
-        The load_to_slot_layer internally waits for compute_done[slot] before
-        starting the transfer, ensuring no data race.
-        """
-        from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
-
-        num_blocks = len(cpu_block_table)
-        if num_blocks == 0:
-            return None, None
-
-        pipeline_depth = len(load_slots)
-        if pipeline_depth == 0:
-            return None, None
-
-        o_acc, lse_acc = None, None
-
-        if pipeline_depth == 1:
-            # Only 1 slot available, cannot pipeline - use synchronous mode
-            # IMPORTANT: Must use compute_stream to match synchronization in
-            # load_to_slot_layer (waits for compute_done) and wait_slot_layer
-            slot = load_slots[0]
-            compute_stream = offload_engine.compute_stream
-            for block_idx in range(num_blocks):
-                cpu_block_id = cpu_block_table[block_idx]
-                offload_engine.load_to_slot_layer(slot, self.layer_id, cpu_block_id)
-                offload_engine.wait_slot_layer(slot)
-
-                with torch.cuda.stream(compute_stream):
-                    # Debug: call hooks on compute_stream (synchronized with transfer)
-                    if offload_engine.debug_mode:
-                        offload_engine._call_debug_hooks(slot, self.layer_id, cpu_block_id)
-
-                    prev_k, prev_v = offload_engine.get_kv_for_slot(slot)
-
-                    prev_o, prev_lse = flash_attn_with_lse(
-                        q_batched, prev_k, prev_v,
-                        softmax_scale=self.scale,
-                        causal=False,
-                    )
-                    # Record compute done so next load can safely reuse this slot
-                    offload_engine.record_slot_compute_done(slot)
-                if o_acc is None:
-                    o_acc, lse_acc = prev_o, prev_lse
-                else:
-                    o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
-            return o_acc, lse_acc
-
-        # N-way pipeline: use ALL available slots for maximum overlap
-        # Pipeline depth = num_slots - 1 (num_slots blocks in flight)
-        num_slots = len(load_slots)
-
-        # Phase 1: Pre-load up to num_slots blocks to fill the pipeline
-        # This starts all transfers in parallel, utilizing full PCIe bandwidth
-        num_preload = min(num_slots, num_blocks)
-        for i in range(num_preload):
-            offload_engine.load_to_slot_layer(load_slots[i], self.layer_id, cpu_block_table[i])
-
-        # Phase 2: Main loop - compute and immediately reuse slot for next transfer
-        # Use dedicated compute_stream (not default stream) to enable overlap with transfers
-        compute_stream = offload_engine.compute_stream
-
-        for block_idx in range(num_blocks):
-            torch.cuda.nvtx.range_push(f"PipelineBlock: L{self.layer_id} B{block_idx}")
-
-            # Cycle through slots: slot[block_idx % num_slots]
-            current_slot = load_slots[block_idx % num_slots]
-            cpu_block_id = cpu_block_table[block_idx]
-
-            # Wait for current slot's transfer to complete (on compute_stream)
-            offload_engine.wait_slot_layer(current_slot)
-
-            # Compute attention on current slot's data
-            # IMPORTANT: Use dedicated compute_stream to avoid implicit sync with default stream
-            with torch.cuda.stream(compute_stream):
-                # Debug: call hooks on compute_stream (synchronized with transfer)
-                if offload_engine.debug_mode:
-                    offload_engine._call_debug_hooks(current_slot, self.layer_id, cpu_block_id)
-
-                torch.cuda.nvtx.range_push(f"FlashAttn: L{self.layer_id} PrevBlock{block_idx}")
-                prev_k, prev_v = offload_engine.get_kv_for_slot(current_slot)
-
-                prev_o, prev_lse = flash_attn_with_lse(
-                    q_batched, prev_k, prev_v,
-                    softmax_scale=self.scale,
-                    causal=False,
-                )
-                torch.cuda.nvtx.range_pop()
-
-                # Record compute done - this allows the next transfer to safely overwrite this slot
-                offload_engine.record_slot_compute_done(current_slot)
-
-            # Immediately start loading the NEXT block into this slot (if more blocks remain)
-            # Key insight: reuse current_slot immediately after compute is done!
-            next_block_idx = block_idx + num_slots
-            if next_block_idx < num_blocks:
-                offload_engine.load_to_slot_layer(current_slot, self.layer_id, cpu_block_table[next_block_idx])
-
-            # Merge with accumulated (also on compute_stream for consistency)
-            with torch.cuda.stream(compute_stream):
-                if o_acc is None:
-                    o_acc, lse_acc = prev_o, prev_lse
-                else:
-                    o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
-
-            torch.cuda.nvtx.range_pop()  # PipelineBlock
-
-        return o_acc, lse_acc
+        return final_o

     def _chunked_decode_attention(
         self,
@@ -517,6 +272,8 @@ class Attention(nn.Module):
         if last_block_valid_tokens == 0 and total_prefill_tokens > 0:
             last_block_valid_tokens = block_size  # Last block was exactly full

+        offload_engine = kvcache_manager.offload_engine
+
         # Apply sparse policy if enabled (Quest does Top-K selection for decode)
         sparse_policy = kvcache_manager.sparse_policy
         if sparse_policy is not None:
@@ -530,11 +287,9 @@ class Attention(nn.Module):
                 total_kv_len=len(cpu_block_table) * kvcache_manager.block_size,
             )
             cpu_block_table = sparse_policy.select_blocks(
-                cpu_block_table, policy_ctx
+                cpu_block_table, offload_engine, policy_ctx
             )

-        offload_engine = kvcache_manager.offload_engine
-
         # Use cross-layer pipeline if active (initialized in model_runner)
         if offload_engine.is_pipeline_active():
             o_acc, lse_acc = self._decode_with_layer_pipeline(