simplify

2025-06-21 17:04:53 +08:00
parent ad4e95fbdc
commit cde3fc22c2
9 changed files with 42 additions and 100 deletions
--- a/nanovllm/engine/block_manager.py
+++ b/nanovllm/engine/block_manager.py
@@ -5,14 +5,6 @@ import numpy as np
 from nanovllm.engine.sequence import Sequence


-def compute_hash(token_ids: list[int], prefix: int = -1):
-    h = xxhash.xxh64()
-    if prefix != -1:
-        h.update(prefix.to_bytes(8, "little"))
-    h.update(np.array(token_ids).tobytes())
-    return h.intdigest()
-
-
 class Block:

    def __init__(self, block_id):
@@ -22,7 +14,6 @@ class Block:
        self.token_ids = []

    def update(self, hash: int, token_ids: list[int]):
-        assert hash != -1
        self.hash = hash
        self.token_ids = token_ids

@@ -42,7 +33,15 @@ class BlockManager:
        self.free_block_ids: deque[int] = deque(range(num_blocks))
        self.used_block_ids: set[int] = set()

-    def _allocate_block(self, block_id: int):
+    @staticmethod
+    def compute_hash(token_ids: list[int], prefix: int = -1) -> int:
+        h = xxhash.xxh64()
+        if prefix != -1:  # -1 means there is no preceding hash to chain from
+            h.update(prefix.to_bytes(8, "little"))  # feed the prefix hash before the tokens
+        h.update(np.array(token_ids).tobytes())
+        return h.intdigest()  # integer digest; callers use it as a dict key
+
+    def _allocate_block(self, block_id: int) -> Block:
        block = self.blocks[block_id]
        assert block.ref_count == 0
        block.reset()
@@ -50,12 +49,12 @@ class BlockManager:
        self.used_block_ids.add(block_id)
        return self.blocks[block_id]

-    def _deallocate_block(self, block_id: int):
+    def _deallocate_block(self, block_id: int) -> None:  # no return value: only moves block_id from used to free
        assert self.blocks[block_id].ref_count == 0
        self.used_block_ids.remove(block_id)
        self.free_block_ids.append(block_id)

-    def can_allocate(self, seq: Sequence):
+    def can_allocate(self, seq: Sequence) -> bool:  # True when the free list holds at least seq.num_blocks blocks
        return len(self.free_block_ids) >= seq.num_blocks

    def allocate(self, seq: Sequence):
@@ -64,7 +63,7 @@ class BlockManager:
        cache_miss = False
        for i in range(seq.num_blocks):
            token_ids = seq.block(i)
-            h = compute_hash(token_ids, h) if len(token_ids) == self.block_size else -1
+            h = self.compute_hash(token_ids, h) if len(token_ids) == self.block_size else -1
            block_id = self.hash_to_block_id.get(h, -1)
            if block_id == -1 or self.blocks[block_id].token_ids != token_ids:
                cache_miss = True
@@ -92,7 +91,7 @@ class BlockManager:
        seq.num_cached_tokens = 0
        seq.block_table.clear()

-    def can_append(self, seq: Sequence):
+    def can_append(self, seq: Sequence) -> bool:  # the bool on the right compares as 1 or 0: one free block is required only when len(seq) % block_size == 1
        return len(self.free_block_ids) >= (len(seq) % self.block_size == 1)

    def may_append(self, seq: Sequence):
@@ -107,7 +106,7 @@ class BlockManager:
            assert last_block.hash == -1
            token_ids = seq.block(seq.num_blocks-1)
            prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
-            h = compute_hash(token_ids, prefix)
+            h = self.compute_hash(token_ids, prefix)
            last_block.update(h, token_ids)
            self.hash_to_block_id[h] = last_block.block_id
        else:
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -6,10 +6,9 @@ from multiprocessing.shared_memory import SharedMemory

 from nanovllm.config import Config
 from nanovllm.engine.sequence import Sequence
-from nanovllm.utils.context import set_context, get_context, reset_context
-from nanovllm.utils.memory import get_gpu_memory
 from nanovllm.models.qwen3 import Qwen3ForCausalLM
 from nanovllm.layers.sampler import Sampler
+from nanovllm.utils.context import set_context, get_context, reset_context
 from nanovllm.utils.loader import load_model


@@ -93,11 +92,11 @@ class ModelRunner:
    def allocate_kv_cache(self, gpu_memory_utilization):
        config = self.config
        hf_config = config.hf_config
-        total, used, _ = get_gpu_memory()
-        free = total * gpu_memory_utilization - used
+        free, total = torch.cuda.mem_get_info()
+        used = total - free
        num_kv_heads = hf_config.num_key_value_heads // self.world_size
        block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * hf_config.head_dim * hf_config.torch_dtype.itemsize
-        config.num_kvcache_blocks = int(free) // block_bytes
+        config.num_kvcache_blocks = int(total * gpu_memory_utilization - used) // block_bytes
        self.kv_cache = torch.zeros(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, hf_config.head_dim)
        layer_id = 0
        for module in self.model.modules():
@@ -142,7 +141,6 @@ class ModelRunner:
                    end = start + seq.last_block_num_tokens 
                slot_mapping.extend(list(range(start, end)))
        assert len(input_ids) == len(slot_mapping)
-        assert len(input_ids) == cu_seqlens_q[-1]
        if cu_seqlens_k[-1] > cu_seqlens_q[-1]:    # prefix cache
            block_tables = self.prepare_block_tables(seqs)
        input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
--- a/nanovllm/engine/scheduler.py
+++ b/nanovllm/engine/scheduler.py
@@ -53,10 +53,8 @@ class Scheduler:
                num_seqs += 1
                self.block_manager.may_append(seq)
                scheduled_seqs.append(seq)
-        running = deque(scheduled_seqs)
-        running.extend(self.running)
-        self.running = running
        assert scheduled_seqs
+        self.running.extendleft(reversed(scheduled_seqs))
        return scheduled_seqs, False

    def preempt(self, seq: Sequence):