[feat] Added chunked prefill and kvcache offload mechanism.

This commit is contained in:
Zijie Tian
2025-12-10 03:47:37 +08:00
parent 204fe2b38f
commit 0b6f19242d
25 changed files with 4414 additions and 61 deletions

View File

@@ -1,19 +1,22 @@
from collections import deque
from time import perf_counter_ns
from typing import TYPE_CHECKING
from nanovllm.config import Config
from nanovllm.engine.sequence import Sequence, SequenceStatus
from nanovllm.engine.block_manager import BlockManager
from nanovllm.utils.observer import Observer
if TYPE_CHECKING:
from nanovllm.kvcache import KVCacheManager
class Scheduler:
def __init__(self, config: Config):
def __init__(self, config: Config, kvcache_manager: "KVCacheManager"):
self.max_num_seqs = config.max_num_seqs
self.max_num_batched_tokens = config.max_num_batched_tokens
self.eos = config.eos
self.block_manager = BlockManager(config.num_kvcache_blocks, config.kvcache_block_size)
self.kvcache_manager = kvcache_manager
self.waiting: deque[Sequence] = deque()
self.running: deque[Sequence] = deque()
@@ -32,10 +35,10 @@ class Scheduler:
if Observer.ttft_start == 0:
Observer.ttft_start = perf_counter_ns()
seq = self.waiting[0]
if num_batched_tokens + len(seq) > self.max_num_batched_tokens or not self.block_manager.can_allocate(seq):
if num_batched_tokens + len(seq) > self.max_num_batched_tokens or not self.kvcache_manager.can_allocate(seq):
break
num_seqs += 1
self.block_manager.allocate(seq)
self.kvcache_manager.allocate(seq)
num_batched_tokens += len(seq) - seq.num_cached_tokens
seq.status = SequenceStatus.RUNNING
self.waiting.popleft()
@@ -47,7 +50,7 @@ class Scheduler:
# decode
while self.running and num_seqs < self.max_num_seqs:
seq = self.running.popleft()
while not self.block_manager.can_append(seq):
while not self.kvcache_manager.can_append(seq):
if self.running:
self.preempt(self.running.pop())
else:
@@ -55,7 +58,7 @@ class Scheduler:
break
else:
num_seqs += 1
self.block_manager.may_append(seq)
self.kvcache_manager.may_append(seq)
scheduled_seqs.append(seq)
assert scheduled_seqs
self.running.extendleft(reversed(scheduled_seqs))
@@ -63,7 +66,7 @@ class Scheduler:
def preempt(self, seq: Sequence):
seq.status = SequenceStatus.WAITING
self.block_manager.deallocate(seq)
self.kvcache_manager.deallocate(seq)
self.waiting.appendleft(seq)
def postprocess(self, seqs: list[Sequence], token_ids: list[int]) -> list[bool]:
@@ -71,5 +74,5 @@ class Scheduler:
seq.append_token(token_id)
if (not seq.ignore_eos and token_id == self.eos) or seq.num_completion_tokens == seq.max_tokens:
seq.status = SequenceStatus.FINISHED
self.block_manager.deallocate(seq)
self.kvcache_manager.deallocate(seq)
self.running.remove(seq)