Merge branch 'tzj/minference' of ssh://git.zijie-tian.site:2222/zijie-tian/nano-vllm into tzj/minference
This commit is contained in:
@@ -231,6 +231,11 @@ class HybridKVCacheManager(KVCacheManager):
|
||||
seq.num_cached_tokens = 0
|
||||
seq.block_table.clear()
|
||||
|
||||
# Reset OffloadEngine state to prevent request-to-request contamination
|
||||
# This clears all KV buffers and pending async events
|
||||
if self.offload_engine is not None:
|
||||
self.offload_engine.reset()
|
||||
|
||||
def can_append(self, seq: Sequence) -> bool:
|
||||
"""Check if we can append a token."""
|
||||
need_new_block = (len(seq) % self._block_size == 1)
|
||||
|
||||
@@ -244,6 +244,42 @@ class OffloadEngine:
|
||||
"""
|
||||
return self.k_cache_gpu, self.v_cache_gpu
|
||||
|
||||
def reset(self) -> None:
    """
    Zero the engine's KV buffers and forget any pending events.

    Intended to be run between requests that share one engine instance so
    that values written by an earlier request are not visible to the next.

    Steps, in order:
        1. Call ``zero_()`` on each of these attributes:
           - k_cache_gpu, v_cache_gpu (GPU ring buffer slots)
           - decode_k_buffer, decode_v_buffer (per-layer decode buffers)
           - layer_k_buffer_a, layer_v_buffer_a,
             layer_k_buffer_b, layer_v_buffer_b (cross-layer pipeline)
           - prefill_k_buffer, prefill_v_buffer (per-layer prefill buffers)
        2. Empty ``pending_events``.

    NOTE(review): the pending events are discarded, not waited on - confirm
    that no transfer can still be in flight when this is called.
    """
    # Attribute names are listed in the order the buffers are zeroed.
    buffer_names = (
        # GPU ring buffer slots
        "k_cache_gpu",
        "v_cache_gpu",
        # per-layer decode buffers
        "decode_k_buffer",
        "decode_v_buffer",
        # cross-layer pipeline buffers (A/B pair)
        "layer_k_buffer_a",
        "layer_v_buffer_a",
        "layer_k_buffer_b",
        "layer_v_buffer_b",
        # per-layer prefill buffers
        "prefill_k_buffer",
        "prefill_v_buffer",
    )
    for attr in buffer_names:
        # Look up and zero one buffer at a time so a missing attribute
        # fails at the same point it would with explicit statements.
        getattr(self, attr).zero_()

    # Drop every recorded async transfer event.
    self.pending_events.clear()
|
||||
|
||||
# ========== Memory info ==========
|
||||
|
||||
def gpu_memory_bytes(self) -> int:
|
||||
|
||||
Reference in New Issue
Block a user