Merge branch 'zijie/layer-prefill-1' into tzj/vs_offload
Adds MInference sparse attention support:
- New MInference sparse policy implementation
- A-shape, vertical-slash, and block-sparse patterns
- Updated bench.py with sparse attention options
- test_minference_gpu.py validation test

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
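For orientation (this sketch is not part of the diff below): the new Config fields introduced in this commit are meant to be combined roughly as follows to turn on MInference sparse prefill. The model path is an illustrative placeholder, and all other fields keep the defaults added here.

    from nanovllm.config import Config, SparsePolicyType

    # "/path/to/model" is a placeholder; Config.__post_init__ asserts it is an existing directory.
    cfg = Config(
        model="/path/to/model",
        sparse_policy=SparsePolicyType.MINFERENCE,   # GPU-only sparse prefill (requires enable_cpu_offload=False)
        minference_adaptive_budget=0.3,              # pattern budget as a fraction of seq_len; None -> fixed sizes
    )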
@@ -9,6 +9,7 @@ class SparsePolicyType(Enum):
    """Sparse attention policy types."""
    FULL = auto()        # No sparse attention (load all blocks)
    QUEST = auto()       # Query-aware Top-K block selection (decode only)
    MINFERENCE = auto()  # MInference vertical + slash sparse prefill (GPU-only)


@dataclass
@@ -39,10 +40,18 @@ class Config:
    # Sparse attention configuration
    # Quest: decode-only sparse attention with Top-K block selection
    # FULL: no sparse attention (load all blocks)
    # MINFERENCE: MInference vertical + slash sparse prefill (GPU-only)
    sparse_policy: SparsePolicyType = SparsePolicyType.FULL
    sparse_topk_blocks: int = 8       # Top-K blocks for Quest
    sparse_threshold_blocks: int = 4  # Apply sparse only when blocks > threshold

    # MInference configuration (used when sparse_policy == MINFERENCE)
    minference_adaptive_budget: float = 0.3  # Budget as fraction of seq_len (None to use fixed sizes)
    minference_vertical_size: int = 1000     # Fixed vertical size (if adaptive_budget is None)
    minference_slash_size: int = 6096        # Fixed slash size (if adaptive_budget is None)
    minference_num_sink_tokens: int = 30     # Sink tokens to always keep
    minference_num_recent_diags: int = 100   # Recent diagonals to always keep

    def __post_init__(self):
        assert os.path.isdir(self.model)
        assert self.kvcache_block_size % 256 == 0
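To make the adaptive budget concrete, a back-of-envelope example (the 20/80 vertical/slash split comes from MInferencePolicy.estimate_pattern further down in this diff): for a 64K-token prefill with the defaults above,

    seq_len = 65536
    budget = int(seq_len * 0.3)                     # 19660 positions in total
    vertical_size = max(30 + 1, int(budget * 0.2))  # 3932 key columns (at least num_sink_tokens + 1)
    slash_size = max(100 + 1, int(budget * 0.8))    # 15728 diagonals (at least num_recent_diags + 1)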
@@ -4,7 +4,7 @@ import torch.distributed as dist
from multiprocessing.synchronize import Event
from multiprocessing.shared_memory import SharedMemory

-from nanovllm.config import Config
+from nanovllm.config import Config, SparsePolicyType
from nanovllm.engine.sequence import Sequence
from nanovllm.models.qwen3 import Qwen3ForCausalLM
from nanovllm.layers.sampler import GreedySampler
@@ -35,7 +35,10 @@ class ModelRunner:
        self.model = Qwen3ForCausalLM(hf_config)
        load_model(self.model, config.model)
        self.sampler = GreedySampler()

        # Initialize sparse_prefill_policy before warmup (will be configured in allocate_kv_cache)
        self.sparse_prefill_policy = None

        #> Disable warmup for debugging
        self.warmup_model()

@@ -148,6 +151,24 @@ class ModelRunner:
        # Create KV cache manager using factory
        self.kvcache_manager: KVCacheManager = create_kvcache_manager(config)

        # Create sparse prefill policy for GPU-only path
        # This is separate from CPU offload sparse policy (which uses select_blocks)
        self.sparse_prefill_policy = None
        if not config.enable_cpu_offload and config.sparse_policy != SparsePolicyType.FULL:
            from nanovllm.kvcache.sparse import create_sparse_policy
            policy = create_sparse_policy(
                config.sparse_policy,
                vertical_size=config.minference_vertical_size,
                slash_size=config.minference_slash_size,
                adaptive_budget=config.minference_adaptive_budget,
                num_sink_tokens=config.minference_num_sink_tokens,
                num_recent_diags=config.minference_num_recent_diags,
            )
            # Only use if policy supports sparse prefill
            if policy.supports_prefill:
                self.sparse_prefill_policy = policy
                logger.info(f"Sparse prefill policy enabled: {self.sparse_prefill_policy}")

        # Allocate cache through manager
        self.kvcache_manager.allocate_cache(
            num_layers=hf_config.num_hidden_layers,
@@ -329,7 +350,10 @@ class ModelRunner:
        cu_seqlens_q = torch.tensor(cu_seqlens_q, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
        cu_seqlens_k = torch.tensor(cu_seqlens_k, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
        slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
-       set_context(True, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, None, block_tables)
+       set_context(True, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
+                   slot_mapping, None, block_tables,
+                   sparse_prefill_policy=self.sparse_prefill_policy)
        return input_ids, positions

    def prepare_decode(self, seqs: list[Sequence]):
@@ -23,6 +23,7 @@ from nanovllm.config import SparsePolicyType
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
from nanovllm.kvcache.sparse.quest import QuestPolicy, QuestConfig, BlockMetadataManager
from nanovllm.kvcache.sparse.minference import MInferencePolicy


def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
@@ -55,6 +56,15 @@ def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
        )
        return QuestPolicy(config)

    elif policy_type == SparsePolicyType.MINFERENCE:
        return MInferencePolicy(
            vertical_size=kwargs.get("vertical_size", 1000),
            slash_size=kwargs.get("slash_size", 6096),
            adaptive_budget=kwargs.get("adaptive_budget", 0.3),
            num_sink_tokens=kwargs.get("num_sink_tokens", 30),
            num_recent_diags=kwargs.get("num_recent_diags", 100),
        )

    else:
        raise ValueError(f"Unknown policy type: {policy_type}")

@@ -67,5 +77,6 @@ __all__ = [
    "QuestPolicy",
    "QuestConfig",
    "BlockMetadataManager",
    "MInferencePolicy",
    "create_sparse_policy",
]
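As a usage sketch (not part of the diff): the factory above is normally driven from Config via ModelRunner, but it can also be called directly. The keyword names mirror the MInferencePolicy constructor, and the values shown are this commit's defaults.

    from nanovllm.config import SparsePolicyType
    from nanovllm.kvcache.sparse import create_sparse_policy

    policy = create_sparse_policy(
        SparsePolicyType.MINFERENCE,
        adaptive_budget=0.3,       # fraction of seq_len; set to None to use the fixed sizes below
        vertical_size=1000,
        slash_size=6096,
        num_sink_tokens=30,
        num_recent_diags=100,
    )
    assert policy.supports_prefill  # ModelRunner only keeps policies that can do sparse prefill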
nanovllm/kvcache/sparse/minference.py (new file, 353 lines)
@@ -0,0 +1,353 @@
"""
|
||||
MInference sparse attention policy.
|
||||
|
||||
Implements vertical + slash sparse pattern estimation using the last 64 query tokens.
|
||||
Reference: MInference paper (https://arxiv.org/abs/2407.02490)
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import List, Tuple, Optional
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
|
||||
|
||||
|
||||
class MInferencePolicy(SparsePolicy):
|
||||
"""
|
||||
MInference sparse prefill policy using vertical + slash pattern.
|
||||
|
||||
This policy estimates sparse attention patterns by analyzing attention
|
||||
scores from the last 64 query tokens, then selects:
|
||||
- Vertical: Key positions that are important across all queries
|
||||
- Slash: Diagonal bands (local context)
|
||||
|
||||
The estimated pattern is then used to compute sparse attention.
|
||||
|
||||
Note: This policy is designed for GPU-only prefill. For CPU offload,
|
||||
the pattern estimation and sparse attention will be handled differently.
|
||||
"""
|
||||
|
||||
supports_prefill = True
|
||||
supports_decode = False # MInference is prefill-only sparse strategy
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vertical_size: int = 1000,
|
||||
slash_size: int = 6096,
|
||||
adaptive_budget: Optional[float] = 0.3,
|
||||
num_sink_tokens: int = 30,
|
||||
num_recent_diags: int = 100,
|
||||
):
|
||||
"""
|
||||
Initialize MInference policy.
|
||||
|
||||
Args:
|
||||
vertical_size: Number of vertical (column) positions to keep
|
||||
slash_size: Number of diagonal bands to keep
|
||||
adaptive_budget: If set, compute budget as fraction of seq_len
|
||||
(overrides vertical_size and slash_size)
|
||||
num_sink_tokens: Number of initial sink tokens to always keep
|
||||
num_recent_diags: Number of recent diagonals to always keep
|
||||
"""
|
||||
self.vertical_size = vertical_size
|
||||
self.slash_size = slash_size
|
||||
self.adaptive_budget = adaptive_budget
|
||||
self.num_sink_tokens = num_sink_tokens
|
||||
self.num_recent_diags = num_recent_diags
|
||||
|
||||
# Cache for last-q causal mask
|
||||
self._last_q_mask_cache: dict = {}
|
||||
|
||||
def _get_causal_mask(self, last_q: int, seq_len: int, device: torch.device) -> torch.Tensor:
|
||||
"""Get causal mask for last-q attention."""
|
||||
cache_key = (last_q, seq_len, device)
|
||||
if cache_key not in self._last_q_mask_cache:
|
||||
# Create mask where last_q queries can attend to all previous positions
|
||||
# Shape: [last_q, seq_len]
|
||||
mask = torch.ones(last_q, seq_len, device=device, dtype=torch.bool)
|
||||
# Apply causal constraint for the last last_q positions
|
||||
# Query i (from last_q) can only attend to positions <= (seq_len - last_q + i)
|
||||
for i in range(last_q):
|
||||
mask[i, seq_len - last_q + i + 1:] = False
|
||||
self._last_q_mask_cache[cache_key] = mask
|
||||
return self._last_q_mask_cache[cache_key]
|
||||
|
||||
    def estimate_pattern(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        layer_id: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Estimate vertical + slash sparse pattern using last 64 query tokens.
        Memory-optimized for long sequences (64K+).

        Args:
            q: Query tensor [seq_len, num_heads, head_dim]
            k: Key tensor [seq_len, num_kv_heads, head_dim]
            layer_id: Current layer index (for potential layer-specific patterns)

        Returns:
            Tuple of (vertical_indices, slash_indices):
            - vertical_indices: [num_heads, vertical_size] - important K positions
            - slash_indices: [num_heads, slash_size] - diagonal offsets
        """
        seq_len = q.shape[0]
        num_heads = q.shape[1]
        head_dim = q.shape[2]
        num_kv_heads = k.shape[1]

        # Adaptive budget
        if self.adaptive_budget is not None:
            budget = int(seq_len * self.adaptive_budget)
            vertical_size = max(self.num_sink_tokens + 1, int(budget * 0.2))
            slash_size = max(self.num_recent_diags + 1, int(budget * 0.8))
        else:
            vertical_size = self.vertical_size
            slash_size = self.slash_size

        # Use last 64 Q tokens for estimation
        last_q = min(64, seq_len)
        q_last = q[-last_q:]  # [last_q, heads, dim] - this is a view, not a copy

        # Handle GQA: if num_kv_heads < num_heads, we need to expand K
        if num_kv_heads < num_heads:
            num_groups = num_heads // num_kv_heads
            k_work = k.repeat_interleave(num_groups, dim=1)
        else:
            k_work = k

        # Compute attention scores: [heads, last_q, seq_len]
        scale = 1.0 / math.sqrt(head_dim)
        qk = torch.einsum('qhd,khd->hqk', q_last, k_work) * scale

        # Free k_work if it was a copy
        if num_kv_heads < num_heads:
            del k_work

        # Apply causal mask for last positions (in-place)
        causal_mask = self._get_causal_mask(last_q, seq_len, q.device)
        qk.masked_fill_(~causal_mask.unsqueeze(0), float('-inf'))

        # Softmax (in-place where possible)
        qk = F.softmax(qk, dim=-1, dtype=torch.float32)

        # === Vertical pattern ===
        # Sum across query dimension -> importance of each K position
        vertical_scores = qk.sum(dim=1)  # [heads, seq_len]

        # Force keep first num_sink_tokens (attention sinks) - in-place
        vertical_scores[:, :self.num_sink_tokens] = float('inf')

        # Select top-k
        actual_vertical = min(vertical_size, seq_len)
        vertical_indices = vertical_scores.topk(actual_vertical, dim=-1).indices
        vertical_indices = vertical_indices.sort(dim=-1).values
        del vertical_scores

        # === Slash pattern ===
        # Create diagonal index matrix: [last_q, seq_len] with int32 to save memory
        q_indices = torch.arange(last_q, device=q.device, dtype=torch.int32).unsqueeze(1)
        k_indices = torch.arange(seq_len, device=q.device, dtype=torch.int32).unsqueeze(0)
        diag_indices = (seq_len - last_q + q_indices) - k_indices  # [last_q, seq_len]
        del q_indices

        # Create causal mask for slash computation
        q_pos = seq_len - last_q + torch.arange(last_q, device=q.device, dtype=torch.int32).unsqueeze(1)
        slash_causal_mask = k_indices <= q_pos
        del q_pos, k_indices

        # Clamp diagonal indices to valid range
        diag_indices = diag_indices.clamp(0, seq_len - 1)

        # Apply causal mask to qk (in-place) for slash computation
        qk[:, ~slash_causal_mask] = 0
        del slash_causal_mask

        # Accumulate scores per diagonal - process in batches to save memory
        slash_scores = torch.zeros(num_heads, seq_len, device=q.device, dtype=torch.float32)

        # Process heads in chunks to reduce peak memory for the expanded diag_indices
        chunk_size = min(8, num_heads)  # Process 8 heads at a time
        for h_start in range(0, num_heads, chunk_size):
            h_end = min(h_start + chunk_size, num_heads)
            n_heads_chunk = h_end - h_start

            # Expand diag_indices only for this chunk
            diag_chunk = diag_indices.unsqueeze(0).expand(n_heads_chunk, -1, -1).long()
            qk_chunk = qk[h_start:h_end]

            slash_scores[h_start:h_end].scatter_add_(
                1,
                diag_chunk.reshape(n_heads_chunk, -1),
                qk_chunk.reshape(n_heads_chunk, -1)
            )
            del diag_chunk, qk_chunk

        del diag_indices, qk

        # Force keep first num_recent_diags (in-place)
        slash_scores[:, :self.num_recent_diags] = float('inf')

        # Select top-k diagonal indices
        actual_slash = min(slash_size, seq_len)
        slash_indices = slash_scores.topk(actual_slash, dim=-1).indices
        slash_indices = slash_indices.sort(dim=-1).values
        del slash_scores

        return vertical_indices, slash_indices
    def select_blocks(
        self,
        available_blocks: List[int],
        ctx: PolicyContext,
    ) -> List[int]:
        """
        Select blocks for chunked CPU offload mode.

        For MInference in GPU-only mode, this method is not used.
        In CPU offload mode, it would select blocks based on the sparse pattern.

        For now, return all blocks (full attention fallback).
        """
        # MInference pattern is computed in attention.forward()
        # For CPU offload integration (Phase B), this would use the pattern
        return available_blocks

    def reset(self) -> None:
        """Reset policy state."""
        self._last_q_mask_cache.clear()

    def sparse_prefill_attention(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        layer_id: int,
    ) -> torch.Tensor:
        """
        Compute MInference sparse attention for prefill.

        Uses vertical + slash pattern to compute sparse attention efficiently.
        Memory-optimized to handle long sequences (64K+) by freeing intermediate tensors.

        Args:
            q: Query tensor [seq_len, num_heads, head_dim]
            k: Key tensor [seq_len, num_kv_heads, head_dim]
            v: Value tensor [seq_len, num_kv_heads, head_dim]
            layer_id: Current transformer layer index

        Returns:
            Attention output [seq_len, num_heads, head_dim]
        """
        from minference.ops.pit_sparse_flash_attention_v2 import _triton_mixed_sparse_attention
        from minference.cuda import convert_vertical_slash_indexes

        seq_len = q.shape[0]
        num_heads = q.shape[1]
        head_dim = q.shape[2]
        num_kv_heads = k.shape[1]

        # Estimate sparse pattern (uses temporary memory for qk scores)
        vertical_indices, slash_indices = self.estimate_pattern(q, k, layer_id)
        # Free any cached memory from pattern estimation
        torch.cuda.empty_cache()

        # Triton sparse attention kernel parameters
        block_size_M = 64
        block_size_N = 64

        # Calculate padding
        pad = (block_size_M - seq_len) & (block_size_M - 1)
        need_head_pad = head_dim not in [16, 32, 64, 128, 256, 512]
        head_pad = (2 ** math.ceil(math.log2(head_dim)) - head_dim) if need_head_pad else 0

        # Handle GQA: expand K/V to match query heads
        # Do this BEFORE creating batched tensors to avoid double copies
        if num_kv_heads < num_heads:
            num_groups = num_heads // num_kv_heads
            # Use repeat_interleave for memory-efficient expansion
            k_work = k.repeat_interleave(num_groups, dim=1)
            v_work = v.repeat_interleave(num_groups, dim=1)
        else:
            k_work = k
            v_work = v

        # Transform Q to [batch, heads, seq, dim] format with padding in one step
        # This avoids creating intermediate copies
        if pad > 0 or head_pad > 0:
            q_batched = torch.nn.functional.pad(
                q.unsqueeze(0).transpose(1, 2),
                [0, head_pad, 0, pad, 0, 0, 0, 0]
            ).contiguous()
        else:
            q_batched = q.unsqueeze(0).transpose(1, 2).contiguous()

        # Transform K to batched format
        if pad > 0 or head_pad > 0:
            k_batched = torch.nn.functional.pad(
                k_work.unsqueeze(0).transpose(1, 2),
                [0, head_pad, 0, pad, 0, 0, 0, 0]
            ).contiguous()
        else:
            k_batched = k_work.unsqueeze(0).transpose(1, 2).contiguous()

        # Free k_work if it was a copy (GQA case)
        if num_kv_heads < num_heads:
            del k_work

        # Transform V to batched format
        if pad > 0 or head_pad > 0:
            v_batched = torch.nn.functional.pad(
                v_work.unsqueeze(0).transpose(1, 2),
                [0, head_pad, 0, pad, 0, 0, 0, 0]
            ).contiguous()
        else:
            v_batched = v_work.unsqueeze(0).transpose(1, 2).contiguous()

        # Free v_work if it was a copy (GQA case)
        if num_kv_heads < num_heads:
            del v_work
        torch.cuda.empty_cache()

        # Prepare indices for Triton kernel
        v_idx = vertical_indices.to(torch.int32).reshape((1, num_heads, -1))
        v_idx = v_idx.sort(dim=-1, descending=False)[0].contiguous()
        del vertical_indices

        s_idx = slash_indices.to(torch.int32).reshape((1, num_heads, -1))
        s_idx = s_idx.sort(dim=-1, descending=True)[0].contiguous()
        del slash_indices

        seqlens = torch.tensor([seq_len], dtype=torch.int32, device=q.device)
        sm_scale = head_dim ** -0.5

        # Convert vertical+slash indices to block sparse format
        block_count, block_offset, column_count, column_index = convert_vertical_slash_indexes(
            seqlens, v_idx, s_idx, seq_len, block_size_M, block_size_N,
        )
        del v_idx, s_idx

        # Call Triton mixed sparse attention kernel
        o = _triton_mixed_sparse_attention(
            q_batched, k_batched, v_batched, seqlens,
            block_count, block_offset, column_count, column_index,
            sm_scale, block_size_M, block_size_N,
        )

        # Free input tensors immediately after kernel call
        del q_batched, k_batched, v_batched
        del block_count, block_offset, column_count, column_index

        # Remove padding and convert back to [seq_len, num_heads, head_dim]
        o = o[..., :seq_len, :head_dim]
        o = o.transpose(1, 2).squeeze(0).contiguous()

        return o

    def __repr__(self) -> str:
        return (f"MInferencePolicy("
                f"adaptive_budget={self.adaptive_budget}, "
                f"vertical_size={self.vertical_size}, "
                f"slash_size={self.slash_size})")
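A tiny, self-contained illustration (not part of the file above) of the diagonal bookkeeping in estimate_pattern: with seq_len=6 and last_q=3, entry [i, j] is the diagonal offset that key j contributes to for query i. Diagonal 0 is the token itself, 1 the previous token, and so on; negative (non-causal) entries are zeroed by the causal mask before the scatter_add_.

    import torch

    seq_len, last_q = 6, 3
    q_idx = torch.arange(last_q).unsqueeze(1)    # [3, 1]
    k_idx = torch.arange(seq_len).unsqueeze(0)   # [1, 6]
    diag = (seq_len - last_q + q_idx) - k_idx    # [3, 6]
    print(diag)
    # tensor([[ 3,  2,  1,  0, -1, -2],
    #         [ 4,  3,  2,  1,  0, -1],
    #         [ 5,  4,  3,  2,  1,  0]])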
@@ -183,5 +183,32 @@ class SparsePolicy(ABC):
        """
        pass

    def sparse_prefill_attention(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        layer_id: int,
    ) -> torch.Tensor:
        """
        Compute sparse attention for prefill phase.

        This method is called when supports_prefill=True and the policy
        is used for GPU-only sparse prefill (no CPU offload).

        Args:
            q: Query tensor [seq_len, num_heads, head_dim]
            k: Key tensor [seq_len, num_kv_heads, head_dim]
            v: Value tensor [seq_len, num_kv_heads, head_dim]
            layer_id: Current transformer layer index

        Returns:
            Attention output [seq_len, num_heads, head_dim]
        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement sparse_prefill_attention. "
            "Set supports_prefill=False or implement this method."
        )

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"
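A hedged sketch of how another policy could plug into the hook added above. DenseFallbackPolicy is hypothetical (not in this commit), assumes select_blocks and reset are the other required overrides, and simply reuses PyTorch's scaled_dot_product_attention for a causal dense prefill.

    import torch
    from nanovllm.kvcache.sparse.policy import SparsePolicy

    class DenseFallbackPolicy(SparsePolicy):
        supports_prefill = True
        supports_decode = False

        def select_blocks(self, available_blocks, ctx):
            return available_blocks  # no block filtering

        def reset(self):
            pass

        def sparse_prefill_attention(self, q, k, v, layer_id):
            # [seq_len, heads, dim] -> [1, heads, seq_len, dim]; enable_gqa (PyTorch >= 2.5)
            # lets k/v keep fewer heads than q.
            q_, k_, v_ = (t.transpose(0, 1).unsqueeze(0) for t in (q, k, v))
            o = torch.nn.functional.scaled_dot_product_attention(
                q_, k_, v_, is_causal=True, enable_gqa=True)
            return o.squeeze(0).transpose(0, 1).contiguous()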
@@ -140,6 +140,11 @@ class Attention(nn.Module):
                max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
                max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k,
                softmax_scale=self.scale, causal=True, block_table=context.block_tables)
        elif context.sparse_prefill_policy is not None:
            # Sparse prefill (GPU-only) - delegate to policy
            o = context.sparse_prefill_policy.sparse_prefill_attention(
                q, k, v, self.layer_id
            )
        else:
            o = flash_attn_varlen_func(q, k, v,
                max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
@@ -35,6 +35,10 @@ class Context:
    # Current chunk index for ring buffer pipeline (prefill only)
    current_chunk_idx: int = 0

    # Sparse prefill attention support (GPU-only path)
    # When set, uses policy.sparse_prefill_attention() instead of FlashAttention
    sparse_prefill_policy: Any = None  # SparsePolicy instance with supports_prefill=True


_CONTEXT = Context()

@@ -60,6 +64,7 @@ def set_context(
    decode_pos_in_block=0,
    decode_start_pos_in_block=0,
    current_chunk_idx=0,
    sparse_prefill_policy=None,
):
    global _CONTEXT
    _CONTEXT = Context(
@@ -79,6 +84,7 @@ def set_context(
        decode_pos_in_block=decode_pos_in_block,
        decode_start_pos_in_block=decode_start_pos_in_block,
        current_chunk_idx=current_chunk_idx,
        sparse_prefill_policy=sparse_prefill_policy,
    )