[feat] Added Quest Sparsity Policy.
This commit is contained in:
@@ -19,7 +19,8 @@ Usage:
|
||||
return available_blocks[:5] # Just first 5 blocks
|
||||
"""
|
||||
|
||||
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext, SparsePolicyType
|
||||
from nanovllm.config import SparsePolicyType
|
||||
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
|
||||
from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
|
||||
from nanovllm.kvcache.sparse.quest import QuestPolicy, QuestConfig, BlockMetadataManager
|
||||
|
||||
|
||||
@@ -7,15 +7,11 @@ from CPU for each query chunk during chunked attention computation.
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from typing import List, Optional, Any
|
||||
import torch
|
||||
|
||||
|
||||
class SparsePolicyType(Enum):
|
||||
"""Built-in sparse attention policy types."""
|
||||
FULL = auto() # prefill + decode
|
||||
QUEST = auto() # decode only
|
||||
# Import SparsePolicyType from config to avoid circular imports
|
||||
from nanovllm.config import SparsePolicyType
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
Reference in New Issue
Block a user