[feat] Needs to be optimized with async prefetch.

This commit is contained in:
Zijie Tian
2025-12-15 06:58:40 +08:00
parent 1081ab51ea
commit b8b6478506
9 changed files with 556 additions and 404 deletions

View File

@@ -41,8 +41,8 @@ def main():
max_model_len=128 * 1024,
max_num_batched_tokens=128 * 1024,
enable_cpu_offload=True,
num_gpu_blocks=6,
num_prefetch_blocks=2,
num_gpu_blocks=120,
num_prefetch_blocks=4,
)
# Warmup
@@ -54,12 +54,12 @@ def main():
# bench_prefill(llm, num_seqs=1, input_len=1024)
# bench_prefill(llm, num_seqs=1, input_len=2048)
# bench_prefill(llm, num_seqs=1, input_len=4096)
bench_prefill(llm, num_seqs=1, input_len=64 * 1024)
bench_prefill(llm, num_seqs=1, input_len=16 * 1024)
print("=" * 60)
print("Decode Benchmark (CPU Offload)")
print("=" * 60)
bench_decode(llm, num_seqs=1, input_len=64 * 1024, max_output_len=128)
bench_decode(llm, num_seqs=1, input_len=16 * 1024, max_output_len=128)
# bench_decode(llm, num_seqs=1, input_len=2048, max_output_len=128)