[fix] Fix request-to-request contamination by resetting OffloadEngine state between requests.

This commit is contained in:
Zijie Tian
2026-01-19 00:55:26 +08:00
parent e6e0dc5d7d
commit 50520a6c3c
4 changed files with 76 additions and 3 deletions

View File

@@ -231,6 +231,11 @@ class HybridKVCacheManager(KVCacheManager):
seq.num_cached_tokens = 0
seq.block_table.clear()
# Reset OffloadEngine state to prevent request-to-request contamination
# This clears all KV buffers and pending async events
if self.offload_engine is not None:
self.offload_engine.reset()
def can_append(self, seq: Sequence) -> bool:
"""Check if we can append a token."""
need_new_block = (len(seq) % self._block_size == 1)

View File

@@ -278,6 +278,42 @@ class OffloadEngine:
"""
return self.k_cache_gpu, self.v_cache_gpu
def reset(self) -> None:
    """Zero the engine's KV buffers in place and forget pending events.

    Intended to be invoked between generate() calls when one
    OffloadEngine instance is reused, so that no key/value data from a
    finished request is visible to the next one.

    Buffers zeroed, in this order:
        1. Ring buffer slots: ``k_cache_gpu``, ``v_cache_gpu``
        2. Per-layer decode buffers: ``decode_k_buffer``, ``decode_v_buffer``
        3. Cross-layer pipeline buffers: ``layer_k_buffer_a``,
           ``layer_v_buffer_a``, ``layer_k_buffer_b``, ``layer_v_buffer_b``
        4. Per-layer prefill buffers: ``prefill_k_buffer``,
           ``prefill_v_buffer``

    Afterwards ``pending_events`` is emptied.

    NOTE(review): only the attributes listed above are touched; confirm
    whether any CPU-side cache storage also needs clearing.
    NOTE(review): pending events are dropped without being waited on;
    confirm no transfer can still be in flight when this is called.
    """
    # Ordered table of every buffer attribute that must be wiped.
    buffers_to_wipe = (
        # ring buffer slots
        "k_cache_gpu",
        "v_cache_gpu",
        # per-layer decode buffers
        "decode_k_buffer",
        "decode_v_buffer",
        # cross-layer pipeline buffers (a/b pair)
        "layer_k_buffer_a",
        "layer_v_buffer_a",
        "layer_k_buffer_b",
        "layer_v_buffer_b",
        # per-layer prefill buffers
        "prefill_k_buffer",
        "prefill_v_buffer",
    )
    for attr_name in buffers_to_wipe:
        # zero_() mutates the existing storage, so outside references
        # to these buffers keep pointing at the same (now zeroed) data.
        getattr(self, attr_name).zero_()

    # Discard bookkeeping for async transfers tied to the old request.
    self.pending_events.clear()
# ========== Memory info ==========
def gpu_memory_bytes(self) -> int: