[refactor] Refactor current GPU and CPU block allocation strategy.

This commit is contained in:
Zijie Tian
2025-12-10 21:23:31 +08:00
parent 0a247ccb1b
commit 190df5f70d
7 changed files with 906 additions and 162 deletions

View File

@@ -40,7 +40,6 @@ def main():
         max_model_len=128 * 1024,
         max_num_batched_tokens=128 * 1024,
         enable_cpu_offload=True,
-        cpu_memory_gb=32.0,
     )
     # Warmup

View File

@@ -19,7 +19,6 @@ class Config:
     # CPU Offload configuration
     enable_cpu_offload: bool = False
-    cpu_memory_gb: float = 16.0  # CPU memory limit for KV cache
     offload_policy: str = "lru"  # "lru", "fifo", or full class path
     num_transfer_streams: int = 4  # Number of CUDA streams for async transfers
     num_gpu_blocks: int = -1  # User-specified GPU blocks count, -1 = auto (use max available)

View File

@@ -10,8 +10,11 @@ from nanovllm.models.qwen3 import Qwen3ForCausalLM
 from nanovllm.layers.sampler import Sampler
 from nanovllm.utils.context import set_context, get_context, reset_context
 from nanovllm.utils.loader import load_model
+from nanovllm.utils.logger import get_logger
 from nanovllm.kvcache import create_kvcache_manager, KVCacheManager
 
+logger = get_logger("model_runner")
+
 
 class ModelRunner:
@@ -120,9 +123,11 @@ class ModelRunner:
         num_gpu_blocks = max_gpu_blocks
 
         if config.enable_cpu_offload:
-            # Calculate CPU blocks based on cpu_memory_gb
-            cpu_bytes = int(config.cpu_memory_gb * 1024**3)
-            num_cpu_blocks = cpu_bytes // block_bytes
+            # Ping-Pong design: CPU is the primary store, GPU is the working buffer.
+            # CPU blocks = all blocks needed for max_model_len (full KV of one maximum-length sequence).
+            # GPU blocks = Ping-Pong working buffer (user-specified or auto).
+            num_cpu_blocks = (config.max_model_len + self.block_size - 1) // self.block_size
         config.num_gpu_kvcache_blocks = num_gpu_blocks
         config.num_cpu_kvcache_blocks = num_cpu_blocks
         # For backward compatibility
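For reference, the new sizing rule with illustrative numbers (block_size and max_model_len here are assumed, not taken from the diff):

    # Ceil-divide the maximum sequence length into KV blocks (illustrative values).
    block_size = 256
    max_model_len = 16 * 1024
    num_cpu_blocks = (max_model_len + block_size - 1) // block_size   # -> 64 blocks on CPU
    num_gpu_blocks = 10                                               # user-specified working buffer
    ping_size = num_gpu_blocks // 2                                   # -> 5 slots per Ping/Pong half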
@@ -143,6 +148,27 @@ class ModelRunner:
             dtype=hf_config.torch_dtype,
         )
 
+        # Log KV cache allocation info
+        gpu_memory_mb = config.num_gpu_kvcache_blocks * block_bytes / (1024 ** 2)
+        cpu_memory_mb = config.num_cpu_kvcache_blocks * block_bytes / (1024 ** 2)
+        total_memory_mb = gpu_memory_mb + cpu_memory_mb
+        if config.enable_cpu_offload:
+            logger.info(
+                f"KV Cache allocated (Ping-Pong mode): "
+                f"GPU={config.num_gpu_kvcache_blocks} blocks ({gpu_memory_mb:.1f}MB), "
+                f"CPU={config.num_cpu_kvcache_blocks} blocks ({cpu_memory_mb:.1f}MB), "
+                f"Total={total_memory_mb:.1f}MB, "
+                f"block_size={self.block_size}, "
+                f"ping_size={config.num_gpu_kvcache_blocks // 2}"
+            )
+        else:
+            logger.info(
+                f"KV Cache allocated: "
+                f"GPU={config.num_gpu_kvcache_blocks} blocks ({gpu_memory_mb:.1f}MB), "
+                f"block_size={self.block_size}"
+            )
+
         # Bind layer caches to attention modules and set layer_id
         layer_id = 0
         for module in self.model.modules():
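The `block_bytes` value used in the logging block is computed elsewhere in the runner and is not shown in this diff; a plausible definition for a paged KV cache, assumed purely for illustration, is:

    import torch

    # Assumed per-block byte count: K and V planes for every layer, block_size tokens each.
    num_layers, block_size, num_kv_heads, head_dim = 36, 256, 8, 128  # illustrative values
    dtype_size = torch.finfo(torch.bfloat16).bits // 8                # 2 bytes per element
    block_bytes = 2 * num_layers * block_size * num_kv_heads * head_dim * dtype_size
    print(f"{block_bytes / (1024 ** 2):.1f} MB per block")            # ~36.0 MB with these numbers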
@@ -328,7 +354,16 @@ class ModelRunner:
         return self.model.compute_logits(graph_vars["outputs"][:bs])
 
     def run(self, seqs: list[Sequence], is_prefill: bool) -> list[int]:
-        # Check if chunked prefill is needed
+        # Check if Ping-Pong mode should be used (all blocks on CPU)
+        if hasattr(self, 'kvcache_manager') and hasattr(self.kvcache_manager, 'get_all_cpu_blocks'):
+            use_pingpong = self._should_use_pingpong(seqs, is_prefill)
+            if use_pingpong:
+                if is_prefill:
+                    return self.run_pingpong_prefill(seqs)
+                else:
+                    return self.run_pingpong_decode(seqs)
+
+        # Check if chunked prefill is needed (legacy path)
         if is_prefill and hasattr(self, 'kvcache_manager'):
             needs_chunked = any(
                 hasattr(self.kvcache_manager, 'needs_chunked_prefill') and
@@ -338,7 +373,7 @@ class ModelRunner:
             if needs_chunked:
                 return self.run_chunked_prefill(seqs)
 
-        # Check if chunked decode is needed
+        # Check if chunked decode is needed (legacy path)
         if not is_prefill and hasattr(self, 'kvcache_manager'):
             needs_chunked = any(
                 hasattr(self.kvcache_manager, 'needs_chunked_decode') and
@@ -355,6 +390,36 @@ class ModelRunner:
         reset_context()
         return token_ids
 
+    def _should_use_pingpong(self, seqs: list[Sequence], is_prefill: bool) -> bool:
+        """
+        Check if Ping-Pong mode should be used.
+
+        Use Ping-Pong when:
+        - CPU offload is enabled
+        - There are blocks on CPU (either allocated there or offloaded)
+        - Sequence exceeds GPU capacity
+        """
+        if not hasattr(self.kvcache_manager, 'offload_engine'):
+            return False
+
+        for seq in seqs:
+            if not seq.block_table:
+                continue  # Skip warmup sequences
+
+            # Check if any blocks are on CPU
+            cpu_blocks, _ = self.kvcache_manager.get_all_cpu_blocks(seq)
+            if cpu_blocks:
+                # Has CPU blocks - use Ping-Pong
+                return True
+
+            # Check if sequence needs more blocks than GPU can hold
+            ping_size = self.kvcache_manager.offload_engine.ping_size
+            if seq.num_blocks > ping_size:
+                # Needs chunked processing
+                return True
+
+        return False
+
     def run_chunked_prefill(self, seqs: list[Sequence]) -> list[int]:
         """
         Run prefill in chunks when sequences exceed GPU capacity.
@@ -543,6 +608,210 @@ class ModelRunner:
         return input_ids, positions
 
+    def run_pingpong_prefill(self, seqs: list[Sequence]) -> list[int]:
+        """
+        Run prefill with Ping-Pong dual buffer (CPU is primary storage).
+
+        Flow:
+        1. All blocks are allocated to CPU (primary storage)
+        2. Process tokens in chunks using Ping/Pong GPU buffers
+        3. After each chunk, offload from GPU to CPU
+        4. Alternate between Ping and Pong buffers
+        """
+        import sys
+        assert len(seqs) == 1, "Ping-Pong prefill only supports single sequence"
+        seq = seqs[0]
+
+        offload_engine = self.kvcache_manager.offload_engine
+        ping_size = offload_engine.ping_size
+        tokens_per_chunk = ping_size * self.block_size
+        total_tokens = len(seq)
+
+        print(f"[Ping-Pong Prefill] Starting: {total_tokens} tokens, "
+              f"ping_size={ping_size} blocks, chunk={tokens_per_chunk} tokens",
+              file=sys.stderr)
+
+        current_buffer = "ping"
+        chunk_num = 0
+        logits = None
+        processed_tokens = 0
+
+        # Get CPU block table for offload targets
+        cpu_block_ids, logical_ids = self.kvcache_manager.get_all_cpu_blocks(seq)
+
+        while processed_tokens < total_tokens:
+            chunk_num += 1
+            chunk_start = processed_tokens
+            chunk_end = min(processed_tokens + tokens_per_chunk, total_tokens)
+            chunk_tokens = chunk_end - chunk_start
+
+            # Calculate which CPU blocks this chunk covers
+            start_block_idx = chunk_start // self.block_size
+            end_block_idx = (chunk_end + self.block_size - 1) // self.block_size
+            num_blocks = end_block_idx - start_block_idx
+
+            print(f"[Ping-Pong Prefill] Chunk {chunk_num}: tokens {chunk_start}-{chunk_end}, "
+                  f"blocks {start_block_idx}-{end_block_idx - 1}, buffer={current_buffer}",
+                  file=sys.stderr)
+
+            # Get GPU slots for this chunk (Ping or Pong buffer)
+            if current_buffer == "ping":
+                gpu_slots = offload_engine.ping_slots[:num_blocks]
+            else:
+                gpu_slots = offload_engine.pong_slots[:num_blocks]
+
+            # Prepare inputs
+            input_ids, positions = self._prepare_pingpong_chunk(
+                seq, chunk_start, chunk_end, gpu_slots, start_block_idx
+            )
+            if input_ids.numel() == 0:
+                break
+
+            # Run model forward
+            logits = self.run_model(input_ids, positions, is_prefill=True)
+            reset_context()
+
+            # Mark blocks as prefilled
+            for i in range(start_block_idx, min(end_block_idx, len(seq.block_table))):
+                logical_id = seq.block_table[i]
+                self.kvcache_manager.prefilled_blocks.add(logical_id)
+
+            # Offload this chunk from GPU to CPU (async)
+            chunk_cpu_blocks = cpu_block_ids[start_block_idx:end_block_idx]
+            offload_engine.offload_buffer_to_cpu(current_buffer, chunk_cpu_blocks)
+
+            # Switch buffer for next chunk
+            if current_buffer == "ping":
+                offload_engine.wait_ping_offload_done()
+                current_buffer = "pong"
+            else:
+                offload_engine.wait_pong_offload_done()
+                current_buffer = "ping"
+
+            processed_tokens = chunk_end
+
+        # Wait for all offloads to complete
+        offload_engine.wait_all_offload_done()
+        print(f"[Ping-Pong Prefill] Complete: {chunk_num} chunks", file=sys.stderr)
+
+        # Sample from last logits
+        temperatures = self.prepare_sample(seqs) if self.rank == 0 else None
+        if logits is not None:
+            token_ids = self.sampler(logits, temperatures).tolist() if self.rank == 0 else None
+        else:
+            token_ids = [0] if self.rank == 0 else None
+        return token_ids
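To make the chunking concrete, here is a small walk-through of the loop above with illustrative numbers (block_size and ping_size are assumed, not taken from the diff):

    # Illustrative walk-through of the Ping-Pong prefill chunking.
    block_size, ping_size = 256, 4
    tokens_per_chunk = ping_size * block_size          # 1024 tokens per chunk
    total_tokens = 2048

    processed, buffer = 0, "ping"
    while processed < total_tokens:
        chunk_end = min(processed + tokens_per_chunk, total_tokens)
        start_blk = processed // block_size
        end_blk = (chunk_end + block_size - 1) // block_size
        print(f"{buffer}: tokens {processed}-{chunk_end}, CPU blocks {start_blk}-{end_blk - 1}")
        buffer = "pong" if buffer == "ping" else "ping"
        processed = chunk_end
    # -> ping: tokens 0-1024, CPU blocks 0-3
    # -> pong: tokens 1024-2048, CPU blocks 4-7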
+
+    def _prepare_pingpong_chunk(
+        self,
+        seq: Sequence,
+        chunk_start: int,
+        chunk_end: int,
+        gpu_slots: list[int],
+        start_block_idx: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Prepare inputs for a Ping-Pong prefill chunk."""
+        # Input tokens for this chunk
+        input_ids = seq[chunk_start:chunk_end]
+        positions = list(range(chunk_start, chunk_end))
+
+        # Create slot mapping pointing to GPU slots
+        slot_mapping = []
+        num_tokens = chunk_end - chunk_start
+        token_idx = 0
+        for i, gpu_slot in enumerate(gpu_slots):
+            block_idx = start_block_idx + i
+            block_start = block_idx * self.block_size
+            block_end = min(block_start + self.block_size, len(seq))
+            # How many tokens in this block for this chunk
+            overlap_start = max(chunk_start, block_start)
+            overlap_end = min(chunk_end, block_end)
+            for pos in range(overlap_start, overlap_end):
+                pos_in_block = pos % self.block_size
+                slot = gpu_slot * self.block_size + pos_in_block
+                slot_mapping.append(slot)
+
+        # Convert to tensors
+        input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+        positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+        slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+
+        # Set up context for chunked prefill
+        seqlen = num_tokens
+        cu_seqlens_q = torch.tensor([0, seqlen], dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        cu_seqlens_k = torch.tensor([0, seqlen], dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        set_context(
+            is_prefill=True,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=seqlen,
+            max_seqlen_k=seqlen,
+            slot_mapping=slot_mapping,
+            is_chunked_prefill=True,
+            offload_engine=self.kvcache_manager,
+            chunked_seq=seq,
+        )
+        return input_ids, positions
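A tiny worked example of the slot-mapping arithmetic above, with assumed sizes (not taken from the diff):

    # slot = gpu_slot * block_size + position_within_block (illustrative values).
    block_size = 4
    gpu_slots = [7, 8]             # buffer slots assigned to this chunk
    chunk_start, chunk_end = 0, 8  # first two blocks of the sequence

    slot_mapping = []
    for i, gpu_slot in enumerate(gpu_slots):
        block_start = i * block_size
        for pos in range(block_start, min(block_start + block_size, chunk_end)):
            slot_mapping.append(gpu_slot * block_size + pos % block_size)

    print(slot_mapping)  # [28, 29, 30, 31, 32, 33, 34, 35]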
+
+    def run_pingpong_decode(self, seqs: list[Sequence]) -> list[int]:
+        """
+        Run decode with Ping-Pong dual buffer.
+
+        All KV is on CPU. Uses Ping-Pong to load KV chunks and compute attention.
+        The new token's KV is written to GPU, then offloaded to CPU.
+        """
+        assert len(seqs) == 1, "Ping-Pong decode only supports single sequence"
+        seq = seqs[0]
+
+        # Prepare inputs
+        input_ids = torch.tensor([seq.last_token], dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+        positions = torch.tensor([len(seq) - 1], dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+
+        # Get write slot for new KV (will use last slot of the buffer used for final chunk)
+        write_slot = self.kvcache_manager.get_write_slot_for_pingpong(seq)
+
+        # Calculate position in block for slot mapping
+        last_block_idx = seq.num_blocks - 1
+        pos_in_block = (len(seq) - 1) % self.block_size
+        slot = write_slot * self.block_size + pos_in_block
+        slot_mapping = torch.tensor([slot], dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        context_len = torch.tensor([len(seq)], dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+
+        # Set up context for chunked decode
+        set_context(
+            is_prefill=False,
+            slot_mapping=slot_mapping,
+            context_lens=context_len,
+            is_chunked_prefill=True,  # Use chunked attention path
+            offload_engine=self.kvcache_manager,
+            chunked_seq=seq,
+        )
+
+        # Run model forward pass
+        logits = self.run_model(input_ids, positions, is_prefill=False)
+        reset_context()
+
+        # Offload new KV from write_slot to CPU
+        last_cpu_block = self.kvcache_manager.get_last_cpu_block(seq)
+        if last_cpu_block >= 0:
+            self.kvcache_manager.offload_engine.offload_slot_to_cpu(write_slot, last_cpu_block)
+            self.kvcache_manager.offload_engine.wait_all_offload_done()
+
+        # Sample
+        temperatures = self.prepare_sample(seqs) if self.rank == 0 else None
+        token_ids = self.sampler(logits, temperatures).tolist() if self.rank == 0 else None
+        return token_ids
+
     @torch.inference_mode()
     def capture_cudagraph(self):
         config = self.config

View File

@@ -81,20 +81,24 @@ class HybridKVCacheManager(KVCacheManager):
         num_cpu_blocks: int,
         block_size: int,
         policy: Optional[EvictionPolicy] = None,
+        cpu_primary: bool = True,
     ):
         """
         Initialize hybrid manager.
 
         Args:
             num_gpu_slots: Number of GPU buffer slots (working set)
-            num_cpu_blocks: Number of CPU pool blocks (overflow)
+            num_cpu_blocks: Number of CPU pool blocks (overflow or primary storage)
             block_size: Tokens per block
             policy: Eviction policy (default: LRU)
+            cpu_primary: If True, use CPU as primary storage with Ping-Pong GPU buffer.
+                If False, use GPU as primary with CPU as overflow (legacy mode).
         """
         self._block_size = block_size
         self.num_gpu_slots = num_gpu_slots
         self.num_cpu_blocks = num_cpu_blocks
         self.total_blocks = num_gpu_slots + num_cpu_blocks
+        self.cpu_primary = cpu_primary  # Ping-Pong mode flag
 
         # Eviction policy
         self.policy = policy or LRUPolicy()
@@ -321,12 +325,16 @@ class HybridKVCacheManager(KVCacheManager):
""" """
Allocate logical blocks for prefill. Allocate logical blocks for prefill.
New blocks are allocated on GPU when possible. If GPU is full and all In cpu_primary mode (Ping-Pong): All blocks are allocated to CPU.
GPU blocks belong to this sequence (can't evict), remaining blocks In legacy mode: Blocks are allocated to GPU when possible, overflow to CPU.
are allocated to CPU for chunked prefill.
""" """
assert not seq.block_table, "Sequence already has blocks" assert not seq.block_table, "Sequence already has blocks"
# Ping-Pong模式所有blocks都分配到CPU
if self.cpu_primary:
return self.allocate_cpu_only(seq)
# Legacy模式GPU为主CPU为overflow
h = -1 h = -1
cache_miss = False cache_miss = False
@@ -451,11 +459,20 @@ class HybridKVCacheManager(KVCacheManager):
             block.hash = -1
             block.token_ids = []
 
-            # New decode blocks go to GPU
+            if self.cpu_primary:
+                # Ping-Pong mode: allocate the new block on CPU
+                if not self.free_cpu_blocks:
+                    raise RuntimeError("No free CPU blocks for decode")
+                cpu_block_id = self.free_cpu_blocks.popleft()
+                block.location = BlockLocation.CPU
+                block.cpu_block_id = cpu_block_id
+                block.gpu_slot = -1
+                self.cpu_block_to_logical[cpu_block_id] = logical_id
+            else:
+                # Legacy mode: allocate the new block on GPU
                 gpu_slot = self._allocate_gpu_slot()
                 block.location = BlockLocation.GPU
                 block.gpu_slot = gpu_slot
                 self.gpu_slot_to_logical[gpu_slot] = logical_id
                 self.policy.on_block_allocated(gpu_slot, self.current_step)
@@ -993,6 +1010,158 @@ class HybridKVCacheManager(KVCacheManager):
                 break
         return pos
 
+    # ========== Ping-Pong dual-buffer support ==========
+
+    def allocate_cpu_only(self, seq: Sequence) -> None:
+        """
+        Allocate CPU blocks for a sequence (used in Ping-Pong mode).
+
+        Unlike allocate(), all blocks are placed on CPU here;
+        the GPU is only used as a working buffer.
+
+        Args:
+            seq: Sequence to allocate for
+        """
+        assert not seq.block_table, "Sequence already has blocks"
+
+        for i in range(seq.num_blocks):
+            # Allocate a CPU block
+            if not self.free_cpu_blocks:
+                raise RuntimeError(
+                    f"No free CPU blocks. Need {seq.num_blocks}, "
+                    f"available: {len(self.free_cpu_blocks)}"
+                )
+            cpu_block_id = self.free_cpu_blocks.popleft()
+
+            # Allocate a logical block
+            logical_id = self.free_logical_ids.popleft()
+            block = self.logical_blocks[logical_id]
+            block.ref_count = 1
+            block.location = BlockLocation.CPU
+            block.cpu_block_id = cpu_block_id
+            block.gpu_slot = -1
+
+            self.cpu_block_to_logical[cpu_block_id] = logical_id
+            seq.block_table.append(logical_id)
+
+    def get_cpu_block_table(self, seq: Sequence) -> List[int]:
+        """
+        Get the list of CPU block IDs for a sequence.
+
+        Args:
+            seq: Sequence
+
+        Returns:
+            List of CPU block IDs, in sequence order
+        """
+        cpu_blocks = []
+        for logical_id in seq.block_table:
+            block = self.logical_blocks[logical_id]
+            if block.location == BlockLocation.CPU:
+                cpu_blocks.append(block.cpu_block_id)
+            else:
+                # If a block is on GPU it should still have a corresponding CPU block;
+                # in Ping-Pong mode all data ultimately lives on CPU.
+                raise RuntimeError(
+                    f"Block {logical_id} not on CPU (location={block.location}). "
+                    f"In Ping-Pong mode, all blocks should be on CPU."
+                )
+        return cpu_blocks
+
+    def get_all_cpu_blocks(self, seq: Sequence) -> Tuple[List[int], List[int]]:
+        """
+        Get all CPU blocks of a sequence together with their logical IDs.
+
+        Args:
+            seq: Sequence
+
+        Returns:
+            (cpu_block_ids, logical_ids)
+        """
+        cpu_block_ids = []
+        logical_ids = []
+        for logical_id in seq.block_table:
+            block = self.logical_blocks[logical_id]
+            if block.location == BlockLocation.CPU:
+                cpu_block_ids.append(block.cpu_block_id)
+                logical_ids.append(logical_id)
+        return cpu_block_ids, logical_ids
+
+    def allocate_next_cpu_block(self, seq: Sequence) -> int:
+        """
+        Allocate the next CPU block for a sequence (for a new token during decode).
+
+        Args:
+            seq: Sequence
+
+        Returns:
+            The newly allocated CPU block ID
+        """
+        if not self.free_cpu_blocks:
+            raise RuntimeError("No free CPU blocks")
+        cpu_block_id = self.free_cpu_blocks.popleft()
+
+        logical_id = self.free_logical_ids.popleft()
+        block = self.logical_blocks[logical_id]
+        block.ref_count = 1
+        block.location = BlockLocation.CPU
+        block.cpu_block_id = cpu_block_id
+        block.gpu_slot = -1
+
+        self.cpu_block_to_logical[cpu_block_id] = logical_id
+        seq.block_table.append(logical_id)
+        return cpu_block_id
+
+    def get_last_cpu_block(self, seq: Sequence) -> int:
+        """
+        Get the CPU block ID of the sequence's last block.
+
+        Returns -1 if the last block is not on CPU.
+
+        Args:
+            seq: Sequence
+
+        Returns:
+            CPU block ID, or -1 if the block is not on CPU
+        """
+        if not seq.block_table:
+            return -1
+        last_logical_id = seq.block_table[-1]
+        block = self.logical_blocks[last_logical_id]
+        if block.location == BlockLocation.CPU:
+            return block.cpu_block_id
+        return -1
+
+    def get_write_slot_for_pingpong(self, seq: Sequence) -> int:
+        """
+        Get the GPU slot that the new KV is written to during Ping-Pong decode.
+
+        Strategy: use the number of chunks the sequence needs to determine whether
+        the last chunk used the Ping or the Pong buffer, then use the last slot of
+        that buffer.
+
+        Args:
+            seq: Sequence
+
+        Returns:
+            GPU slot ID
+        """
+        cpu_blocks, _ = self.get_all_cpu_blocks(seq)
+        ping_size = self.offload_engine.ping_size
+        num_chunks = (len(cpu_blocks) + ping_size - 1) // ping_size if cpu_blocks else 0
+
+        # Which buffer did the last chunk use?
+        if num_chunks % 2 == 1 or num_chunks == 0:
+            # Odd number of chunks (or zero): the last chunk used Ping
+            return self.offload_engine.ping_slots[-1]
+        else:
+            # Even number of chunks: the last chunk used Pong
+            return self.offload_engine.pong_slots[-1]
+
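A quick check of the parity rule above, with an assumed ping_size of 4:

    # num_chunks is a ceil-division of the CPU block count by ping_size.
    ping_size = 4
    for num_cpu_blocks in (3, 4, 5, 8, 9):
        num_chunks = (num_cpu_blocks + ping_size - 1) // ping_size
        last_buffer = "ping" if num_chunks % 2 == 1 or num_chunks == 0 else "pong"
        print(num_cpu_blocks, num_chunks, last_buffer)
    # 3 -> 1 chunk  -> ping
    # 4 -> 1 chunk  -> ping
    # 5 -> 2 chunks -> pong
    # 8 -> 2 chunks -> pong
    # 9 -> 3 chunks -> ping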
     def __repr__(self) -> str:
         return (
             f"HybridKVCacheManager(\n"

View File

@@ -64,6 +64,14 @@ class OffloadEngine:
         self.kv_dim = num_kv_heads * head_dim
         self.block_numel = block_size * self.kv_dim
 
+        # ========== Ping-Pong dual-buffer configuration ==========
+        assert num_gpu_blocks >= 2, "Ping-Pong requires at least 2 GPU blocks"
+        self.ping_size = num_gpu_blocks // 2
+        self.pong_size = num_gpu_blocks - self.ping_size
+        self.ping_slots = list(range(self.ping_size))                   # [0, 1, 2, ...]
+        self.pong_slots = list(range(self.ping_size, num_gpu_blocks))   # [ping_size, ping_size+1, ...]
+        self.num_gpu_slots = num_gpu_blocks  # alias
+
         # ========== Fixed-address GPU KV cache ==========
         # Shape: [num_layers, num_gpu_blocks, block_size, kv_heads, head_dim]
         self.k_cache_gpu = torch.empty(
@@ -103,6 +111,17 @@ class OffloadEngine:
         self.compute_stream = torch.cuda.current_stream()
         self._stream_idx = 0
 
+        # ========== Dedicated Ping-Pong stream and events ==========
+        self.pingpong_stream = torch.cuda.Stream()  # Dedicated to Ping-Pong transfers
+        # Sync events - load complete
+        self.ping_ready = torch.cuda.Event()
+        self.pong_ready = torch.cuda.Event()
+        # Sync events - offload complete
+        self.ping_offload_done = torch.cuda.Event()
+        self.pong_offload_done = torch.cuda.Event()
+
         # ========== Event tracking for async transfers ==========
         self.pending_events: Dict[Tuple[int, int], torch.cuda.Event] = {}
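To illustrate the buffer split configured above, this is what the partition looks like for an assumed num_gpu_blocks of 10:

    # The first half of the GPU slots becomes the Ping buffer, the rest the Pong buffer.
    num_gpu_blocks = 10
    ping_size = num_gpu_blocks // 2                       # 5
    pong_size = num_gpu_blocks - ping_size                # 5
    ping_slots = list(range(ping_size))                   # [0, 1, 2, 3, 4]
    pong_slots = list(range(ping_size, num_gpu_blocks))   # [5, 6, 7, 8, 9]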
@@ -516,7 +535,211 @@ class OffloadEngine:
f" kv_heads={self.num_kv_heads},\n" f" kv_heads={self.num_kv_heads},\n"
f" head_dim={self.head_dim},\n" f" head_dim={self.head_dim},\n"
f" dtype={self.dtype},\n" f" dtype={self.dtype},\n"
f" ping_size={self.ping_size}, pong_size={self.pong_size},\n"
f" gpu_memory={self.gpu_memory_bytes() / 1024**2:.1f}MB,\n" f" gpu_memory={self.gpu_memory_bytes() / 1024**2:.1f}MB,\n"
f" cpu_memory={self.cpu_memory_bytes() / 1024**2:.1f}MB\n" f" cpu_memory={self.cpu_memory_bytes() / 1024**2:.1f}MB\n"
f")" f")"
) )
+
+    # ========== Ping-Pong dual-buffer methods ==========
+
+    def load_to_ping(self, cpu_block_ids: List[int]) -> None:
+        """
+        Asynchronously load CPU blocks into the Ping buffer.
+
+        Args:
+            cpu_block_ids: List of CPU block IDs to load
+        """
+        if not cpu_block_ids:
+            self.ping_ready.record(self.pingpong_stream)
+            return
+
+        num_to_load = min(len(cpu_block_ids), self.ping_size)
+        logger.debug(f"Ping load: CPU{cpu_block_ids[:num_to_load]} -> GPU ping slots {self.ping_slots[:num_to_load]}")
+
+        with torch.cuda.stream(self.pingpong_stream):
+            for i in range(num_to_load):
+                cpu_id = cpu_block_ids[i]
+                gpu_slot = self.ping_slots[i]
+                # Copy all layers at once
+                self.k_cache_gpu[:, gpu_slot].copy_(
+                    self.k_cache_cpu[:, cpu_id], non_blocking=True
+                )
+                self.v_cache_gpu[:, gpu_slot].copy_(
+                    self.v_cache_cpu[:, cpu_id], non_blocking=True
+                )
+            self.ping_ready.record(self.pingpong_stream)
+
+    def load_to_pong(self, cpu_block_ids: List[int]) -> None:
+        """
+        Asynchronously load CPU blocks into the Pong buffer.
+
+        Args:
+            cpu_block_ids: List of CPU block IDs to load
+        """
+        if not cpu_block_ids:
+            self.pong_ready.record(self.pingpong_stream)
+            return
+
+        num_to_load = min(len(cpu_block_ids), self.pong_size)
+        logger.debug(f"Pong load: CPU{cpu_block_ids[:num_to_load]} -> GPU pong slots {self.pong_slots[:num_to_load]}")
+
+        with torch.cuda.stream(self.pingpong_stream):
+            for i in range(num_to_load):
+                cpu_id = cpu_block_ids[i]
+                gpu_slot = self.pong_slots[i]
+                self.k_cache_gpu[:, gpu_slot].copy_(
+                    self.k_cache_cpu[:, cpu_id], non_blocking=True
+                )
+                self.v_cache_gpu[:, gpu_slot].copy_(
+                    self.v_cache_cpu[:, cpu_id], non_blocking=True
+                )
+            self.pong_ready.record(self.pingpong_stream)
+
+    def wait_ping(self) -> None:
+        """Wait for the Ping buffer load to complete."""
+        self.compute_stream.wait_event(self.ping_ready)
+
+    def wait_pong(self) -> None:
+        """Wait for the Pong buffer load to complete."""
+        self.compute_stream.wait_event(self.pong_ready)
+
+    def offload_buffer_to_cpu(
+        self,
+        buffer: str,
+        cpu_block_ids: List[int],
+    ) -> None:
+        """
+        Asynchronously offload a buffer's KV to CPU.
+
+        Args:
+            buffer: "ping" or "pong"
+            cpu_block_ids: List of target CPU block IDs
+        """
+        slots = self.ping_slots if buffer == "ping" else self.pong_slots
+        event = self.ping_offload_done if buffer == "ping" else self.pong_offload_done
+
+        if not cpu_block_ids:
+            event.record(self.pingpong_stream)
+            return
+
+        num_to_offload = min(len(cpu_block_ids), len(slots))
+        logger.debug(f"{buffer.capitalize()} offload: GPU {slots[:num_to_offload]} -> CPU{cpu_block_ids[:num_to_offload]}")
+
+        with torch.cuda.stream(self.pingpong_stream):
+            # Wait for compute to finish
+            self.pingpong_stream.wait_stream(self.compute_stream)
+            for i in range(num_to_offload):
+                gpu_slot = slots[i]
+                cpu_id = cpu_block_ids[i]
+                self.k_cache_cpu[:, cpu_id].copy_(
+                    self.k_cache_gpu[:, gpu_slot], non_blocking=True
+                )
+                self.v_cache_cpu[:, cpu_id].copy_(
+                    self.v_cache_gpu[:, gpu_slot], non_blocking=True
+                )
+            event.record(self.pingpong_stream)
+
+    def offload_slot_to_cpu(
+        self,
+        gpu_slot: int,
+        cpu_block_id: int,
+    ) -> None:
+        """
+        Asynchronously offload a single GPU slot's KV to CPU.
+
+        Args:
+            gpu_slot: GPU slot ID
+            cpu_block_id: Target CPU block ID
+        """
+        logger.debug(f"Slot offload: GPU[{gpu_slot}] -> CPU[{cpu_block_id}]")
+        with torch.cuda.stream(self.pingpong_stream):
+            self.pingpong_stream.wait_stream(self.compute_stream)
+            self.k_cache_cpu[:, cpu_block_id].copy_(
+                self.k_cache_gpu[:, gpu_slot], non_blocking=True
+            )
+            self.v_cache_cpu[:, cpu_block_id].copy_(
+                self.v_cache_gpu[:, gpu_slot], non_blocking=True
+            )
+
+    def wait_ping_offload_done(self) -> None:
+        """Wait for the Ping buffer offload to complete."""
+        self.compute_stream.wait_event(self.ping_offload_done)
+
+    def wait_pong_offload_done(self) -> None:
+        """Wait for the Pong buffer offload to complete."""
+        self.compute_stream.wait_event(self.pong_offload_done)
+
+    def wait_all_offload_done(self) -> None:
+        """Wait for all offloads to complete."""
+        self.pingpong_stream.synchronize()
+
+    def get_kv_for_ping_slots(
+        self,
+        layer_id: int,
+        num_slots: int,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Get the KV held in the first num_slots slots of the Ping buffer.
+
+        Args:
+            layer_id: Layer ID
+            num_slots: Number of slots needed
+
+        Returns:
+            (k_cache, v_cache), shape: [1, num_slots * block_size, kv_heads, head_dim]
+        """
+        slots = self.ping_slots[:num_slots]
+        k = self.k_cache_gpu[layer_id, slots]  # [num_slots, block_size, heads, dim]
+        v = self.v_cache_gpu[layer_id, slots]
+        # Reshape: [num_slots, block_size, heads, dim] -> [1, num_slots*block_size, heads, dim]
+        k = k.reshape(1, -1, self.num_kv_heads, self.head_dim)
+        v = v.reshape(1, -1, self.num_kv_heads, self.head_dim)
+        return k, v
+
+    def get_kv_for_pong_slots(
+        self,
+        layer_id: int,
+        num_slots: int,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Get the KV held in the first num_slots slots of the Pong buffer.
+
+        Args:
+            layer_id: Layer ID
+            num_slots: Number of slots needed
+
+        Returns:
+            (k_cache, v_cache), shape: [1, num_slots * block_size, kv_heads, head_dim]
+        """
+        slots = self.pong_slots[:num_slots]
+        k = self.k_cache_gpu[layer_id, slots]
+        v = self.v_cache_gpu[layer_id, slots]
+        k = k.reshape(1, -1, self.num_kv_heads, self.head_dim)
+        v = v.reshape(1, -1, self.num_kv_heads, self.head_dim)
+        return k, v
+
+    def get_kv_for_slots(
+        self,
+        layer_id: int,
+        gpu_slots: List[int],
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Get the KV held in the given GPU slots.
+
+        Args:
+            layer_id: Layer ID
+            gpu_slots: List of GPU slot IDs
+
+        Returns:
+            (k_cache, v_cache), shape: [1, len(slots) * block_size, kv_heads, head_dim]
+        """
+        if not gpu_slots:
+            return None, None
+        k = self.k_cache_gpu[layer_id, gpu_slots]
+        v = self.v_cache_gpu[layer_id, gpu_slots]
+        k = k.reshape(1, -1, self.num_kv_heads, self.head_dim)
+        v = v.reshape(1, -1, self.num_kv_heads, self.head_dim)
+        return k, v
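The engine above is used in a standard double-buffered pattern: while the compute stream consumes one buffer, the transfer stream fills the other. A minimal, self-contained sketch of that pattern with plain tensors (assumed shapes and a toy reduction; not the engine's actual API):

    import torch

    def double_buffered_sum(cpu_chunks: list[torch.Tensor]) -> torch.Tensor:
        """Sum pinned CPU chunks on GPU, prefetching each chunk on a side stream."""
        copy_stream = torch.cuda.Stream()
        compute_stream = torch.cuda.current_stream()
        bufs = [torch.empty_like(cpu_chunks[0], device="cuda") for _ in range(2)]
        loaded = [torch.cuda.Event(), torch.cuda.Event()]    # copy into bufs[i] finished
        consumed = [torch.cuda.Event(), torch.cuda.Event()]  # compute on bufs[i] finished
        for ev in consumed:
            ev.record(compute_stream)  # both buffers start out free
        total = torch.zeros((), device="cuda")

        for i, chunk in enumerate(cpu_chunks):
            cur = i % 2
            # Fill the current buffer on the copy stream once its previous user is done.
            with torch.cuda.stream(copy_stream):
                copy_stream.wait_event(consumed[cur])
                bufs[cur].copy_(chunk, non_blocking=True)
                loaded[cur].record(copy_stream)
            # Consume it on the compute stream once the copy has landed.
            compute_stream.wait_event(loaded[cur])
            total += bufs[cur].sum()
            consumed[cur].record(compute_stream)
        return total

    chunks = [torch.randn(256, 1024).pin_memory() for _ in range(4)]
    print(double_buffered_sum(chunks))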

View File

@@ -97,51 +97,89 @@ class Attention(nn.Module):
         context,
     ) -> torch.Tensor:
         """
-        Compute attention with chunked KV from CPU cache.
+        Compute attention with Ping-Pong dual buffer for chunked prefill.
 
         For chunked prefill:
-        1. Load previous KV from CPU for this layer
-        2. Compute attention against previous KV (no causal mask)
+        1. Load previous KV from CPU using Ping-Pong (if any previous chunks)
+        2. Compute attention against previous KV chunks (no causal mask)
         3. Compute attention against current chunk's KV (causal)
-        4. Merge results using online softmax
+        4. Merge all results using online softmax
         """
         from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
 
         # q, k, v shape: [total_tokens, num_heads, head_dim]
-        total_tokens = q.shape[0]
 
         # Reshape for flash attention: [batch, seq, heads, dim]
         q_batched = q.unsqueeze(0)  # [1, total_tokens, heads, dim]
         k_batched = k.unsqueeze(0)
         v_batched = v.unsqueeze(0)
-        accumulated_o = None
-        accumulated_lse = None
-
-        # Load previous KV from CPU for this layer
-        if context.offload_engine is not None and self.layer_id >= 0:
-            # Get the kvcache_manager from context
-            kvcache_manager = context.offload_engine
-
-            # For each sequence in the chunk, load previous KV
-            # Currently assuming single sequence
-            if hasattr(context, 'chunked_seq') and context.chunked_seq is not None:
-                prev_k, prev_v = kvcache_manager.load_prev_kv_for_layer(
-                    context.chunked_seq,
-                    self.layer_id,
-                )
-                if prev_k is not None and prev_v is not None:
-                    # Compute attention against previous KV (no causal mask)
-                    prev_o, prev_lse = flash_attn_with_lse(
-                        q_batched,
-                        prev_k,
-                        prev_v,
-                        softmax_scale=self.scale,
-                        causal=False,  # No causal mask for previous context
-                    )
-                    accumulated_o = prev_o
-                    accumulated_lse = prev_lse
+        o_acc = None
+        lse_acc = None
+
+        # Load previous KV from CPU using Ping-Pong
+        # Note: context.offload_engine is actually HybridKVCacheManager
+        kvcache_manager = context.offload_engine
+        seq = context.chunked_seq if hasattr(context, 'chunked_seq') else None
+
+        if kvcache_manager is not None and seq is not None and self.layer_id >= 0:
+            # Get prefilled CPU blocks (blocks already written in previous chunks)
+            cpu_block_table = kvcache_manager.get_prefilled_cpu_blocks(seq)
+
+            if cpu_block_table:
+                offload_engine = kvcache_manager.offload_engine
+                ping_size = offload_engine.ping_size
+                num_chunks = (len(cpu_block_table) + ping_size - 1) // ping_size
+                current_buffer = "ping"
+
+                # Prefetch first chunk to Ping buffer
+                first_chunk_end = min(ping_size, len(cpu_block_table))
+                first_chunk_ids = cpu_block_table[:first_chunk_end]
+                offload_engine.load_to_ping(first_chunk_ids)
+
+                for chunk_idx in range(num_chunks):
+                    start = chunk_idx * ping_size
+                    end = min(start + ping_size, len(cpu_block_table))
+                    num_blocks_in_chunk = end - start
+
+                    # Prefetch next chunk to OTHER buffer
+                    if chunk_idx + 1 < num_chunks:
+                        next_start = end
+                        next_end = min(next_start + ping_size, len(cpu_block_table))
+                        next_chunk_ids = cpu_block_table[next_start:next_end]
+                        if current_buffer == "ping":
+                            offload_engine.load_to_pong(next_chunk_ids)
+                        else:
+                            offload_engine.load_to_ping(next_chunk_ids)
+
+                    # Wait for current buffer and get KV
+                    if current_buffer == "ping":
+                        offload_engine.wait_ping()
+                        prev_k, prev_v = offload_engine.get_kv_for_ping_slots(
+                            self.layer_id, num_blocks_in_chunk
+                        )
+                    else:
+                        offload_engine.wait_pong()
+                        prev_k, prev_v = offload_engine.get_kv_for_pong_slots(
+                            self.layer_id, num_blocks_in_chunk
+                        )
+
+                    # Compute attention against this chunk (no causal mask)
+                    prev_o, prev_lse = flash_attn_with_lse(
+                        q_batched,
+                        prev_k,
+                        prev_v,
+                        softmax_scale=self.scale,
+                        causal=False,
+                    )
+
+                    # Merge with accumulated
+                    if o_acc is None:
+                        o_acc, lse_acc = prev_o, prev_lse
+                    else:
+                        o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
+
+                    # Switch buffer
+                    current_buffer = "pong" if current_buffer == "ping" else "ping"
 
         # Compute attention against current chunk's KV (with causal mask)
         current_o, current_lse = flash_attn_with_lse(
@@ -149,17 +187,14 @@ class Attention(nn.Module):
             k_batched,
             v_batched,
             softmax_scale=self.scale,
-            causal=True,  # Causal mask for current chunk
+            causal=True,
         )
 
         # Merge with accumulated
-        if accumulated_o is None:
+        if o_acc is None:
             final_o = current_o
         else:
-            final_o, _ = merge_attention_outputs(
-                accumulated_o, accumulated_lse,
-                current_o, current_lse,
-            )
+            final_o, _ = merge_attention_outputs(o_acc, lse_acc, current_o, current_lse)
 
         # Remove batch dimension: [1, total_tokens, heads, dim] -> [total_tokens, heads, dim]
         return final_o.squeeze(0)
@@ -172,12 +207,13 @@ class Attention(nn.Module):
         context,
     ) -> torch.Tensor:
         """
-        Compute decode attention with KV spread across CPU and GPU.
+        Compute decode attention with Ping-Pong dual buffer.
 
-        Uses chunked attention similar to chunked prefill:
-        1. Process blocks on GPU first (if any)
-        2. Load CPU blocks in chunks to GPU slots (per-layer)
-        3. Compute attention for each chunk, merge with online softmax
+        All KV is stored on CPU. Uses Ping-Pong buffers on GPU:
+        1. Load first chunk to Ping buffer
+        2. While computing on current buffer, prefetch next chunk to other buffer
+        3. Alternate between Ping and Pong buffers
+        4. Merge attention outputs using online softmax (LSE)
         """
         from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
@@ -185,48 +221,56 @@ class Attention(nn.Module):
         # Need: [batch, seqlen, heads, dim]
         q_batched = q.unsqueeze(1)  # [batch, 1, heads, dim]
 
+        # Note: context.offload_engine is actually HybridKVCacheManager
         kvcache_manager = context.offload_engine
         seq = context.chunked_seq
 
-        o_acc = None
-        lse_acc = None
-
-        # Step 1: Process blocks already on GPU (if any)
-        gpu_slots, _ = kvcache_manager.get_gpu_blocks_for_decode(seq)
-        if gpu_slots:
-            k_gpu, v_gpu = kvcache_manager.get_kv_for_gpu_slots(self.layer_id, gpu_slots)
-            o_gpu, lse_gpu = flash_attn_with_lse(
-                q_batched, k_gpu, v_gpu,
-                softmax_scale=self.scale,
-                causal=False,
-            )
-            o_acc, lse_acc = o_gpu, lse_gpu
-
-        # Step 2: Process CPU blocks in chunks
-        # Get chunk info from kvcache_manager
-        cpu_block_ids, cpu_logical_ids, num_chunks = kvcache_manager.get_decode_chunk_info(seq)
-
-        if num_chunks > 0:
-            # Use num_gpu_slots - 1 to avoid the reserved slot (used for write block)
-            chunk_size = kvcache_manager.num_gpu_slots - 1
-
-            for chunk_idx in range(num_chunks):
-                start = chunk_idx * chunk_size
-                end = min(start + chunk_size, len(cpu_block_ids))
-                chunk_cpu_ids = cpu_block_ids[start:end]
-
-                # Load this chunk to GPU slots 0, 1, 2, ... for THIS LAYER
-                # (slot num_gpu_slots-1 is reserved for write block)
-                gpu_slots_for_chunk = list(range(len(chunk_cpu_ids)))
-                kvcache_manager.offload_engine.load_cpu_blocks_to_gpu_slots(
-                    self.layer_id,
-                    chunk_cpu_ids,
-                    gpu_slots_for_chunk,
-                )
-
-                # Get KV for this chunk
-                k_chunk, v_chunk = kvcache_manager.get_kv_for_gpu_slots(
-                    self.layer_id, gpu_slots_for_chunk
-                )
+        # Get all CPU blocks for this sequence
+        cpu_block_table, _ = kvcache_manager.get_all_cpu_blocks(seq)
+        if not cpu_block_table:
+            raise RuntimeError("Chunked decode attention failed: no CPU blocks available")
+
+        # Get the actual offload_engine for Ping-Pong operations
+        offload_engine = kvcache_manager.offload_engine
+
+        # Calculate chunk info
+        ping_size = offload_engine.ping_size
+        num_chunks = (len(cpu_block_table) + ping_size - 1) // ping_size
+
+        o_acc = None
+        lse_acc = None
+        current_buffer = "ping"
+
+        # Prefetch first chunk to Ping buffer (loads all layers at once)
+        first_chunk_end = min(ping_size, len(cpu_block_table))
+        first_chunk_ids = cpu_block_table[:first_chunk_end]
+        offload_engine.load_to_ping(first_chunk_ids)
+
+        for chunk_idx in range(num_chunks):
+            start = chunk_idx * ping_size
+            end = min(start + ping_size, len(cpu_block_table))
+            num_blocks_in_chunk = end - start
+
+            # Prefetch next chunk to OTHER buffer (overlapped with current computation)
+            if chunk_idx + 1 < num_chunks:
+                next_start = end
+                next_end = min(next_start + ping_size, len(cpu_block_table))
+                next_chunk_ids = cpu_block_table[next_start:next_end]
+                if current_buffer == "ping":
+                    offload_engine.load_to_pong(next_chunk_ids)
+                else:
+                    offload_engine.load_to_ping(next_chunk_ids)
+
+            # Wait for current buffer to be ready and get KV
+            if current_buffer == "ping":
+                offload_engine.wait_ping()
+                k_chunk, v_chunk = offload_engine.get_kv_for_ping_slots(
+                    self.layer_id, num_blocks_in_chunk
+                )
+            else:
+                offload_engine.wait_pong()
+                k_chunk, v_chunk = offload_engine.get_kv_for_pong_slots(
+                    self.layer_id, num_blocks_in_chunk
+                )
 
             # Compute attention for this chunk
@@ -242,6 +286,9 @@ class Attention(nn.Module):
             else:
                 o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, o_chunk, lse_chunk)
 
+            # Switch buffer for next iteration
+            current_buffer = "pong" if current_buffer == "ping" else "ping"
+
         if o_acc is None:
             raise RuntimeError("Chunked decode attention failed: no KV available")
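The `merge_attention_outputs` helper is not part of this diff; the standard log-sum-exp merge it is expected to perform looks roughly like this (LSE layout assumed to be [batch, heads, seq], matching common flash-attention conventions):

    import torch

    def merge_with_lse(o1, lse1, o2, lse2):
        # o*: [batch, seq, heads, dim]; lse*: [batch, heads, seq] (assumed layout).
        # Online-softmax merge: weight each partial output by its share of the total mass.
        lse = torch.logaddexp(lse1, lse2)
        w1 = torch.exp(lse1 - lse).transpose(1, 2).unsqueeze(-1)  # -> [batch, seq, heads, 1]
        w2 = torch.exp(lse2 - lse).transpose(1, 2).unsqueeze(-1)
        return w1 * o1 + w2 * o2, lse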

View File

@@ -14,63 +14,66 @@ os.environ["NANOVLLM_LOG_LEVEL"] = "DEBUG"
 from nanovllm import LLM, SamplingParams
 
 
+def create_long_context_prompt(target_tokens: int) -> str:
+    """
+    Create a meaningful long context prompt with a question at the end.
+    The answer depends on information scattered throughout the context.
+    """
+    # Key facts to embed in the context
+    facts = [
+        "The capital of France is Paris.",
+        "The Eiffel Tower was built in 1889.",
+        "Python was created by Guido van Rossum.",
+        "The speed of light is approximately 299,792 kilometers per second.",
+        "Mount Everest is 8,848 meters tall.",
+    ]
+
+    # Padding text to reach target length
+    padding_paragraph = """
+    This is additional context information that helps extend the length of the prompt.
+    Machine learning has revolutionized many fields including computer vision, natural language processing, and robotics.
+    Deep neural networks can learn complex patterns from large amounts of data.
+    The transformer architecture has become the foundation of modern language models.
+    Attention mechanisms allow models to focus on relevant parts of the input.
+    """
+
+    # Build the prompt
+    prompt_parts = []
+
+    # Add instruction
+    prompt_parts.append("Please read the following information carefully and answer the question at the end.\n\n")
+
+    # Add facts at different positions
+    current_tokens = 50  # approximate tokens so far
+    tokens_per_padding = 80  # approximate tokens per padding paragraph
+    fact_interval = target_tokens // (len(facts) + 1)
+    fact_idx = 0
+
+    while current_tokens < target_tokens - 100:
+        # Add padding
+        prompt_parts.append(padding_paragraph)
+        current_tokens += tokens_per_padding
+
+        # Add a fact at intervals
+        if fact_idx < len(facts) and current_tokens > fact_interval * (fact_idx + 1):
+            prompt_parts.append(f"\n[Important Fact #{fact_idx + 1}]: {facts[fact_idx]}\n")
+            current_tokens += 20
+            fact_idx += 1
+
+    # Add the question at the end
+    prompt_parts.append("\n\nQuestion: Based on the information above, what is the capital of France and when was the Eiffel Tower built? Please answer briefly.\n\nAnswer:")
+
+    return "".join(prompt_parts)
-def test_chunked_prefill(num_gpu_blocks=10, input_len=8192, output_len=16):
-    """Test chunked prefill with limited GPU blocks."""
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    total_blocks = (input_len + 255) // 256
-
-    print(f"=" * 60)
-    print(f"Chunked Prefill Test")
-    print(f"=" * 60)
-    print(f" input_len: {input_len} tokens")
-    print(f" total_blocks: {total_blocks}")
-    print(f" num_gpu_blocks: {num_gpu_blocks}")
-    print(f" blocks_on_cpu: {max(0, total_blocks - num_gpu_blocks)}")
-    print()
-
-    llm = LLM(
-        path,
-        enforce_eager=False,
-        max_model_len=16 * 1024,  # 16K is enough for 8K test
-        max_num_batched_tokens=16 * 1024,
-        enable_cpu_offload=True,
-        cpu_memory_gb=4.0,
-        num_gpu_blocks=num_gpu_blocks,
-    )
-    print(f"LLM initialized:")
-    print(f" num_gpu_kvcache_blocks: {llm.model_runner.config.num_gpu_kvcache_blocks}")
-    print(f" num_cpu_kvcache_blocks: {llm.model_runner.config.num_cpu_kvcache_blocks}")
-    print()
-
-    # Create prompt with approximate token count
-    prompt = "Hello " * (input_len // 2)
-
-    print(f"Running generation...")
-    outputs = llm.generate(
-        [prompt],
-        SamplingParams(temperature=0.6, max_tokens=output_len),
-        use_tqdm=True,
-    )
-    print()
-    print(f"Output tokens: {len(outputs[0]['token_ids'])}")
-    print(f"Output text (first 100 chars): {outputs[0]['text'][:100]}")
-    print()
-    return outputs
-
-def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
-    """Test chunked decode with limited GPU blocks."""
-    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    total_blocks = (input_len + 255) // 256
-
-    print(f"=" * 60)
-    print(f"Chunked Decode Test")
-    print(f"=" * 60)
-    print(f" input_len: {input_len} tokens")
-    print(f" output_len: {output_len} tokens")
-    print(f" total_blocks: {total_blocks}")
+def test_chunked_prefill(num_gpu_blocks=10, input_len=8192, output_len=64):
+    """Test chunked prefill with limited GPU blocks."""
+    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
+
+    print(f"=" * 60)
+    print(f"Chunked Prefill Test (Ping-Pong)")
+    print(f"=" * 60)
+    print(f" target_input_len: ~{input_len} tokens")
     print(f" num_gpu_blocks: {num_gpu_blocks}")
     print()
@@ -80,27 +83,62 @@ def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
         max_model_len=16 * 1024,
         max_num_batched_tokens=16 * 1024,
         enable_cpu_offload=True,
-        cpu_memory_gb=4.0,
         num_gpu_blocks=num_gpu_blocks,
     )
-    print(f"LLM initialized:")
-    print(f" num_gpu_kvcache_blocks: {llm.model_runner.config.num_gpu_kvcache_blocks}")
-    print(f" num_cpu_kvcache_blocks: {llm.model_runner.config.num_cpu_kvcache_blocks}")
     print()
 
-    prompt = "Hello " * (input_len // 2)
+    # Create meaningful prompt
+    prompt = create_long_context_prompt(input_len)
 
     print(f"Running generation...")
     outputs = llm.generate(
         [prompt],
-        SamplingParams(temperature=0.6, max_tokens=output_len),
-        use_tqdm=True,
+        SamplingParams(temperature=0.1, max_tokens=output_len),  # low temperature for more deterministic output
+        use_tqdm=False,
     )
     print()
     print(f"Output tokens: {len(outputs[0]['token_ids'])}")
-    print(f"Output text (first 100 chars): {outputs[0]['text'][:100]}")
+    print(f"Output text:\n{outputs[0]['text']}")
+    print()
+    return outputs
+
+
+def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=128):
+    """Test chunked decode with limited GPU blocks."""
+    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
+
+    print(f"=" * 60)
+    print(f"Chunked Decode Test (Ping-Pong)")
+    print(f"=" * 60)
+    print(f" target_input_len: ~{input_len} tokens")
+    print(f" output_len: {output_len} tokens")
+    print(f" num_gpu_blocks: {num_gpu_blocks}")
+    print()
+
+    llm = LLM(
+        path,
+        enforce_eager=False,
+        max_model_len=16 * 1024,
+        max_num_batched_tokens=16 * 1024,
+        enable_cpu_offload=True,
+        num_gpu_blocks=num_gpu_blocks,
+    )
+    print()
+
+    # Create meaningful prompt
+    prompt = create_long_context_prompt(input_len)
+
+    print(f"Running generation...")
+    outputs = llm.generate(
+        [prompt],
+        SamplingParams(temperature=0.1, max_tokens=output_len),
+        use_tqdm=False,
+    )
+    print()
+    print(f"Output tokens: {len(outputs[0]['token_ids'])}")
+    print(f"Output text:\n{outputs[0]['text']}")
     print()
     return outputs
@@ -108,7 +146,7 @@ def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
 if __name__ == "__main__":
     # Parse arguments
     num_gpu_blocks = int(sys.argv[1]) if len(sys.argv) > 1 else 10
-    input_len = int(sys.argv[2]) if len(sys.argv) > 2 else 8192
-    output_len = int(sys.argv[3]) if len(sys.argv) > 3 else 32
+    input_len = int(sys.argv[2]) if len(sys.argv) > 2 else 2048
+    output_len = int(sys.argv[3]) if len(sys.argv) > 3 else 64
 
     test_chunked_prefill(num_gpu_blocks, input_len, output_len)
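With the new defaults (num_gpu_blocks=10, input_len=2048, output_len=64) and a 256-token block size (assumed from the removed `(input_len + 255) // 256` calculation), the prefill splits into two Ping-Pong chunks:

    # Rough chunk count for the default test settings (block_size assumed to be 256).
    block_size, input_len, num_gpu_blocks = 256, 2048, 10
    num_cpu_blocks = (input_len + block_size - 1) // block_size   # 8 blocks on CPU
    ping_size = num_gpu_blocks // 2                               # 5 slots per buffer
    num_chunks = (num_cpu_blocks + ping_size - 1) // ping_size    # 2 chunks: ping then pong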