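[fix] Fixed needle test bug: lowered the default KV-cache block size from 4096 to 1024 (Config, Sequence, PolicyContext, and the parameter docs) and added a `torch.cuda.synchronize()` in Attention before k_cache/v_cache are accessed.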
This commit is contained in:
@@ -302,7 +302,7 @@ def _merge_output_kernel(...):
|
|||||||
|
|
||||||
| Parameter | Default | Notes |
|
| Parameter | Default | Notes |
|
||||||
|-----------|---------|-------|
|
|-----------|---------|-------|
|
||||||
| `kvcache_block_size` | 4096 | Tokens per block |
|
| `kvcache_block_size` | 1024 | Tokens per block |
|
||||||
| `max_num_batched_tokens` | 16384 | Set = max_model_len for long context |
|
| `max_num_batched_tokens` | 16384 | Set = max_model_len for long context |
|
||||||
| `gpu_memory_utilization` | 0.9 | GPU memory fraction |
|
| `gpu_memory_utilization` | 0.9 | GPU memory fraction |
|
||||||
| `enable_cpu_offload` | False | Enable for long context |
|
| `enable_cpu_offload` | False | Enable for long context |
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ class Config:
|
|||||||
enforce_eager: bool = False
|
enforce_eager: bool = False
|
||||||
hf_config: AutoConfig | None = None
|
hf_config: AutoConfig | None = None
|
||||||
eos: int = -1
|
eos: int = -1
|
||||||
kvcache_block_size: int = 4096
|
kvcache_block_size: int = 1024
|
||||||
num_kvcache_blocks: int = -1
|
num_kvcache_blocks: int = -1
|
||||||
dtype: str | None = None # "float16", "bfloat16", or None (use model default)
|
dtype: str | None = None # "float16", "bfloat16", or None (use model default)
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ class SequenceStatus(Enum):
|
|||||||
|
|
||||||
|
|
||||||
class Sequence:
|
class Sequence:
|
||||||
block_size = 4096
|
block_size = 1024
|
||||||
counter = count()
|
counter = count()
|
||||||
|
|
||||||
def __init__(self, token_ids: list[int], sampling_params = SamplingParams()):
|
def __init__(self, token_ids: list[int], sampling_params = SamplingParams()):
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ class PolicyContext:
|
|||||||
is_prefill: bool
|
is_prefill: bool
|
||||||
"""True if in prefill phase, False if in decode phase."""
|
"""True if in prefill phase, False if in decode phase."""
|
||||||
|
|
||||||
block_size: int = 4096
|
block_size: int = 1024
|
||||||
"""Number of tokens per block."""
|
"""Number of tokens per block."""
|
||||||
|
|
||||||
total_kv_len: int = 0
|
total_kv_len: int = 0
|
||||||
|
|||||||
@@ -85,6 +85,10 @@ class Attention(nn.Module):
|
|||||||
hasattr(context.kvcache_manager, 'offload_engine')
|
hasattr(context.kvcache_manager, 'offload_engine')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
#! Ensure synchronization before accessing k_cache/v_cache
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
#! =======================================================
|
||||||
|
|
||||||
if is_chunked_offload:
|
if is_chunked_offload:
|
||||||
# Chunked offload mode: use compute_stream for store_kvcache
|
# Chunked offload mode: use compute_stream for store_kvcache
|
||||||
# This ensures proper synchronization with per-layer offload
|
# This ensures proper synchronization with per-layer offload
|
||||||
|
|||||||
Reference in New Issue
Block a user