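[fix] Fixed needle test bug: lower the default KV-cache block size from 4096 to 1024 and synchronize CUDA before accessing k_cache/v_cache.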

2026-01-05 18:34:09 +08:00
parent d623043a3c
commit 054aaff403
5 changed files with 8 additions and 4 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -302,7 +302,7 @@ def _merge_output_kernel(...):
 | Parameter | Default | Notes |
 |-----------|---------|-------|
-| `kvcache_block_size` | 4096 | Tokens per block |
+| `kvcache_block_size` | 1024 | Tokens per block |
 | `max_num_batched_tokens` | 16384 | Set = max_model_len for long context |
 | `gpu_memory_utilization` | 0.9 | GPU memory fraction |
 | `enable_cpu_offload` | False | Enable for long context |
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -15,7 +15,7 @@ class Config:
    enforce_eager: bool = False
    hf_config: AutoConfig | None = None
    eos: int = -1
-    kvcache_block_size: int = 4096
+    kvcache_block_size: int = 1024
    num_kvcache_blocks: int = -1
    dtype: str | None = None  # "float16", "bfloat16", or None (use model default)
--- a/nanovllm/engine/sequence.py
+++ b/nanovllm/engine/sequence.py
@@ -12,7 +12,7 @@ class SequenceStatus(Enum):
 class Sequence:
-    block_size = 4096
+    block_size = 1024
    counter = count()
    def __init__(self, token_ids: list[int], sampling_params = SamplingParams()):
--- a/nanovllm/kvcache/sparse/policy.py
+++ b/nanovllm/kvcache/sparse/policy.py
@@ -39,7 +39,7 @@ class PolicyContext:
    is_prefill: bool
    """True if in prefill phase, False if in decode phase."""
-    block_size: int = 4096
+    block_size: int = 1024
    """Number of tokens per block."""
    total_kv_len: int = 0
--- a/nanovllm/layers/attention.py
+++ b/nanovllm/layers/attention.py
@@ -85,6 +85,10 @@ class Attention(nn.Module):
            hasattr(context.kvcache_manager, 'offload_engine')
        )
        #! Ensure synchronization before accessing k_cache/v_cache
        torch.cuda.synchronize()
        #! =======================================================
        if is_chunked_offload:
            # Chunked offload mode: use compute_stream for store_kvcache
            # This ensures proper synchronization with per-layer offload