[claudesquad] update from 'int-minference-1' on 08 Jan 26 23:22 CST
This commit is contained in:
@@ -25,6 +25,7 @@ class FullAttentionPolicy(SparsePolicy):
|
||||
# Full attention supports both prefill and decode
|
||||
supports_prefill = True
|
||||
supports_decode = True
|
||||
requires_block_selection = False # Load all blocks, no selective loading
|
||||
|
||||
def select_blocks(
|
||||
self,
|
||||
|
||||
@@ -30,6 +30,7 @@ class MInferencePolicy(SparsePolicy):
|
||||
|
||||
supports_prefill = True
|
||||
supports_decode = False # MInference is prefill-only sparse strategy
|
||||
requires_block_selection = False # MInference only affects attention computation, not KV load
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -77,6 +77,12 @@ class SparsePolicy(ABC):
|
||||
supports_prefill: bool = True
|
||||
supports_decode: bool = True
|
||||
|
||||
# Whether this policy requires selective block loading during decode
|
||||
# If True: OffloadEngine will call select_blocks() before loading KV from CPU
|
||||
# If False: OffloadEngine will load all blocks (select_blocks ignored for load)
|
||||
# Example: MInference=False (only affects attention), Quest=True (affects load)
|
||||
requires_block_selection: bool = False
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
num_layers: int,
|
||||
|
||||
@@ -158,6 +158,7 @@ class QuestPolicy(SparsePolicy):
|
||||
# Quest is decode-only
|
||||
supports_prefill = False
|
||||
supports_decode = True
|
||||
requires_block_selection = True # Quest affects KV load strategy (selective block loading)
|
||||
|
||||
def __init__(self, config: QuestConfig):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user