[fix] Fix needle test bug: reduce kvcache_block_size from 4096 to 1024 (config, Sequence, PolicyContext, docs) and add a CUDA synchronize before accessing k_cache/v_cache in Attention.

This commit is contained in:
Zijie Tian
2026-01-05 18:34:09 +08:00
parent d623043a3c
commit 054aaff403
5 changed files with 8 additions and 4 deletions

View File

@@ -302,7 +302,7 @@ def _merge_output_kernel(...):
| Parameter | Default | Notes |
|-----------|---------|-------|
| `kvcache_block_size` | 4096 | Tokens per block |
| `kvcache_block_size` | 1024 | Tokens per block |
| `max_num_batched_tokens` | 16384 | Set = max_model_len for long context |
| `gpu_memory_utilization` | 0.9 | GPU memory fraction |
| `enable_cpu_offload` | False | Enable for long context |

View File

@@ -15,7 +15,7 @@ class Config:
enforce_eager: bool = False
hf_config: AutoConfig | None = None
eos: int = -1
kvcache_block_size: int = 4096
kvcache_block_size: int = 1024
num_kvcache_blocks: int = -1
dtype: str | None = None # "float16", "bfloat16", or None (use model default)

View File

@@ -12,7 +12,7 @@ class SequenceStatus(Enum):
class Sequence:
block_size = 4096
block_size = 1024
counter = count()
def __init__(self, token_ids: list[int], sampling_params = SamplingParams()):

View File

@@ -39,7 +39,7 @@ class PolicyContext:
is_prefill: bool
"""True if in prefill phase, False if in decode phase."""
block_size: int = 4096
block_size: int = 1024
"""Number of tokens per block."""
total_kv_len: int = 0

View File

@@ -84,6 +84,10 @@ class Attention(nn.Module):
context.kvcache_manager is not None and
hasattr(context.kvcache_manager, 'offload_engine')
)
#! Ensure synchronization before accessing k_cache/v_cache
torch.cuda.synchronize()
#! =======================================================
if is_chunked_offload:
# Chunked offload mode: use compute_stream for store_kvcache