[fix] Fixed needle test bug.
This commit is contained in:
@@ -302,7 +302,7 @@ def _merge_output_kernel(...):
 | Parameter | Default | Notes |
 |-----------|---------|-------|
-| `kvcache_block_size` | 4096 | Tokens per block |
+| `kvcache_block_size` | 1024 | Tokens per block |
 | `max_num_batched_tokens` | 16384 | Set = max_model_len for long context |
 | `gpu_memory_utilization` | 0.9 | GPU memory fraction |
 | `enable_cpu_offload` | False | Enable for long context |
@@ -15,7 +15,7 @@ class Config:
     enforce_eager: bool = False
     hf_config: AutoConfig | None = None
     eos: int = -1
-    kvcache_block_size: int = 4096
+    kvcache_block_size: int = 1024
     num_kvcache_blocks: int = -1
     dtype: str | None = None  # "float16", "bfloat16", or None (use model default)
||||
@@ -12,7 +12,7 @@ class SequenceStatus(Enum):
 class Sequence:
-    block_size = 4096
+    block_size = 1024
     counter = count()

     def __init__(self, token_ids: list[int], sampling_params = SamplingParams()):
@@ -39,7 +39,7 @@ class PolicyContext:
     is_prefill: bool
     """True if in prefill phase, False if in decode phase."""

-    block_size: int = 4096
+    block_size: int = 1024
     """Number of tokens per block."""

     total_kv_len: int = 0
@@ -84,6 +84,10 @@ class Attention(nn.Module):
             context.kvcache_manager is not None and
             hasattr(context.kvcache_manager, 'offload_engine')
         )

+        #! Ensure synchronization before accessing k_cache/v_cache
+        torch.cuda.synchronize()
+        #! =======================================================
+
         if is_chunked_offload:
             # Chunked offload mode: use compute_stream for store_kvcache
Reference in New Issue
Block a user