✨ feat: integrate sparse policy architecture into GPU-only mode
- Add compute_prefill() and compute_decode() GPU-only methods to SparsePolicy base class
- Implement GPU-only methods in FullAttentionPolicy using flash_attn
- Add sparse_policy parameter to GPUOnlyManager
- Update create_kvcache_manager() to create FullAttentionPolicy for GPU-only mode
- Route GPU-only attention through sparse_policy in attention.py
- Pass kvcache_manager to context for policy access
- Add --enable-policy flag to bench.py for testing
- Handle warmup phase when kvcache_manager is not yet allocated

This allows GPU-only mode to use the same policy architecture as CPU offload mode, enabling future sparse attention implementations (Quest, XAttention) in GPU-only mode.

Performance verified: ~4890 tok/s (unchanged from baseline)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
14
bench.py
14
bench.py
@@ -40,6 +40,8 @@ def bench_prefill(llm, num_seqs, input_len):
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
from nanovllm.config import SparsePolicyType
|
||||
|
||||
parser = argparse.ArgumentParser(description="Benchmark nanovllm GPU performance")
|
||||
parser.add_argument("--model", type=str, default="~/models/Llama-3.1-8B-Instruct",
|
||||
help="Model path (default: ~/models/Llama-3.1-8B-Instruct)")
|
||||
@@ -48,18 +50,28 @@ def main():
|
||||
parser.add_argument("--max-len", type=int, default=32*1024, help="Max model length (default: 32K)")
|
||||
parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)")
|
||||
parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks")
|
||||
# Sparse policy option (GPU-only mode now supports policy routing)
|
||||
parser.add_argument("--enable-policy", action="store_true",
|
||||
help="Enable sparse policy routing (FullAttentionPolicy by default)")
|
||||
args = parser.parse_args()
|
||||
|
||||
path = os.path.expanduser(args.model)
|
||||
max_len = args.max_len
|
||||
|
||||
print(f"\n[nanovllm GPU] max_len={max_len}")
|
||||
# Configure sparse policy
|
||||
if args.enable_policy:
|
||||
sparse_policy = SparsePolicyType.FULL
|
||||
print(f"\n[nanovllm GPU + Policy] sparse_policy=FULL, max_len={max_len}")
|
||||
else:
|
||||
sparse_policy = None
|
||||
print(f"\n[nanovllm GPU] max_len={max_len}")
|
||||
|
||||
llm = LLM(
|
||||
path,
|
||||
enforce_eager=False,
|
||||
max_model_len=max_len,
|
||||
max_num_batched_tokens=max_len,
|
||||
sparse_policy=sparse_policy,
|
||||
)
|
||||
|
||||
# Warmup
|
||||
|
||||
Reference in New Issue
Block a user