[claudesquad] update from 'int-minference-1' on 08 Jan 26 23:22 CST

2026-01-08 23:22:38 +08:00
parent 0bfe1984ef
commit ea4e904de0
11 changed files with 853 additions and 533 deletions
--- a/nanovllm/kvcache/sparse/full_policy.py
+++ b/nanovllm/kvcache/sparse/full_policy.py
@@ -25,6 +25,7 @@ class FullAttentionPolicy(SparsePolicy):
    # Full attention supports both prefill and decode
    supports_prefill = True
    supports_decode = True
+    requires_block_selection = False  # Load all blocks, no selective loading

    def select_blocks(
        self,
--- a/nanovllm/kvcache/sparse/minference.py
+++ b/nanovllm/kvcache/sparse/minference.py
@@ -30,6 +30,7 @@ class MInferencePolicy(SparsePolicy):

    supports_prefill = True
    supports_decode = False  # MInference is prefill-only sparse strategy
+    requires_block_selection = False  # MInference only affects attention computation, not KV load

    def __init__(
        self,
--- a/nanovllm/kvcache/sparse/policy.py
+++ b/nanovllm/kvcache/sparse/policy.py
@@ -77,6 +77,12 @@ class SparsePolicy(ABC):
    supports_prefill: bool = True
    supports_decode: bool = True

+    # Whether this policy requires selective block loading during decode.
+    # If True: OffloadEngine will call select_blocks() before loading KV from CPU.
+    # If False: OffloadEngine will load all blocks (select_blocks() is not used for loading).
+    # Examples: MInference=False (only affects attention computation), Quest=True (affects which KV blocks are loaded).
+    requires_block_selection: bool = False
+
    def initialize(
        self,
        num_layers: int,
--- a/nanovllm/kvcache/sparse/quest.py
+++ b/nanovllm/kvcache/sparse/quest.py
@@ -158,6 +158,7 @@ class QuestPolicy(SparsePolicy):
    # Quest is decode-only
    supports_prefill = False
    supports_decode = True
+    requires_block_selection = True  # Quest affects KV load strategy (selective block loading)

    def __init__(self, config: QuestConfig):
        """