diff --git a/CLAUDE.md b/CLAUDE.md
index 5b37e28..2413883 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -302,7 +302,7 @@ def _merge_output_kernel(...):
 | Parameter | Default | Notes |
 |-----------|---------|-------|
-| `kvcache_block_size` | 4096 | Tokens per block |
+| `kvcache_block_size` | 1024 | Tokens per block |
 | `max_num_batched_tokens` | 16384 | Set = max_model_len for long context |
 | `gpu_memory_utilization` | 0.9 | GPU memory fraction |
 | `enable_cpu_offload` | False | Enable for long context |
diff --git a/nanovllm/config.py b/nanovllm/config.py
index 3087d3c..97758e0 100644
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -15,7 +15,7 @@ class Config:
     enforce_eager: bool = False
     hf_config: AutoConfig | None = None
     eos: int = -1
-    kvcache_block_size: int = 4096
+    kvcache_block_size: int = 1024
     num_kvcache_blocks: int = -1
     dtype: str | None = None  # "float16", "bfloat16", or None (use model default)
diff --git a/nanovllm/engine/sequence.py b/nanovllm/engine/sequence.py
index e4e2460..b3a8f81 100644
--- a/nanovllm/engine/sequence.py
+++ b/nanovllm/engine/sequence.py
@@ -12,7 +12,7 @@ class SequenceStatus(Enum):
 
 class Sequence:
-    block_size = 4096
+    block_size = 1024
     counter = count()
 
     def __init__(self, token_ids: list[int], sampling_params = SamplingParams()):
diff --git a/nanovllm/kvcache/sparse/policy.py b/nanovllm/kvcache/sparse/policy.py
index 157b3ca..41a0f87 100644
--- a/nanovllm/kvcache/sparse/policy.py
+++ b/nanovllm/kvcache/sparse/policy.py
@@ -39,7 +39,7 @@ class PolicyContext:
     is_prefill: bool
     """True if in prefill phase, False if in decode phase."""
 
-    block_size: int = 4096
+    block_size: int = 1024
     """Number of tokens per block."""
 
     total_kv_len: int = 0
diff --git a/nanovllm/layers/attention.py b/nanovllm/layers/attention.py
index b75a109..914c488 100644
--- a/nanovllm/layers/attention.py
+++ b/nanovllm/layers/attention.py
@@ -84,6 +84,10 @@ class Attention(nn.Module):
             context.kvcache_manager is not None and
             hasattr(context.kvcache_manager, 'offload_engine')
         )
+
+        #! Ensure synchronization before accessing k_cache/v_cache
+        torch.cuda.synchronize()
+        #! =======================================================
 
         if is_chunked_offload:
             # Chunked offload mode: use compute_stream for store_kvcache