[WIP] replace merge attention with triton kernel.
This commit is contained in:
221
tests/test_attention_offload.py
Normal file
221
tests/test_attention_offload.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""
|
||||
Test Attention layer with KV cache offload in isolation.
|
||||
|
||||
This test demonstrates how to use Attention + HybridKVCacheManager directly
|
||||
without requiring full LLMEngine/ModelRunner setup.
|
||||
"""
|
||||
|
||||
import torch
|
||||
from nanovllm.layers.attention import Attention
|
||||
from nanovllm.kvcache.hybrid_manager import HybridKVCacheManager
|
||||
from nanovllm.engine.sequence import Sequence
|
||||
from nanovllm.utils.context import set_context, reset_context
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Configuration
|
||||
# ============================================================
|
||||
|
||||
NUM_LAYERS = 8 # Multi-layer for realistic profiling
|
||||
NUM_HEADS = 8
|
||||
NUM_KV_HEADS = 8
|
||||
HEAD_DIM = 64
|
||||
BLOCK_SIZE = 1024 # tokens per block
|
||||
CHUNK_SIZE = 1024 # tokens per chunk (same as block for simplicity)
|
||||
|
||||
NUM_GPU_SLOTS = 4
|
||||
NUM_CPU_BLOCKS = 16
|
||||
|
||||
DTYPE = torch.float16
|
||||
DEVICE = "cuda"
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Setup: Create Manager and Attention Layers
|
||||
# ============================================================
|
||||
|
||||
def create_manager():
|
||||
"""Create and initialize HybridKVCacheManager with OffloadEngine."""
|
||||
manager = HybridKVCacheManager(
|
||||
num_gpu_slots=NUM_GPU_SLOTS,
|
||||
num_cpu_blocks=NUM_CPU_BLOCKS,
|
||||
block_size=BLOCK_SIZE,
|
||||
)
|
||||
|
||||
# Initialize offload engine (this creates k_cache_gpu/cpu, v_cache_gpu/cpu)
|
||||
manager.allocate_cache(
|
||||
num_layers=NUM_LAYERS,
|
||||
num_kv_heads=NUM_KV_HEADS,
|
||||
head_dim=HEAD_DIM,
|
||||
dtype=DTYPE,
|
||||
)
|
||||
|
||||
return manager
|
||||
|
||||
|
||||
def create_attention_layers(manager):
|
||||
"""Create attention layers and bind KV cache."""
|
||||
layers = []
|
||||
for layer_id in range(NUM_LAYERS):
|
||||
attn = Attention(
|
||||
num_heads=NUM_HEADS,
|
||||
head_dim=HEAD_DIM,
|
||||
scale=HEAD_DIM ** -0.5,
|
||||
num_kv_heads=NUM_KV_HEADS,
|
||||
)
|
||||
attn.layer_id = layer_id
|
||||
|
||||
# Bind KV cache from manager
|
||||
k_cache, v_cache = manager.get_layer_cache(layer_id)
|
||||
attn.k_cache = k_cache
|
||||
attn.v_cache = v_cache
|
||||
|
||||
layers.append(attn.to(DEVICE))
|
||||
|
||||
return layers
|
||||
|
||||
|
||||
def create_test_sequence(manager, num_chunks=3):
|
||||
"""Create a test sequence and allocate blocks."""
|
||||
total_tokens = num_chunks * CHUNK_SIZE
|
||||
|
||||
# Sequence only takes token_ids
|
||||
seq = Sequence(token_ids=list(range(total_tokens)))
|
||||
|
||||
# Set block_size for this test
|
||||
seq.block_size = BLOCK_SIZE
|
||||
|
||||
# Allocate blocks (will be on CPU in CPU-primary mode)
|
||||
manager.allocate(seq)
|
||||
|
||||
return seq
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Chunked Prefill Simulation
|
||||
# ============================================================
|
||||
|
||||
def simulate_chunk_forward(
|
||||
layers,
|
||||
manager,
|
||||
seq,
|
||||
chunk_idx,
|
||||
chunk_size,
|
||||
):
|
||||
"""
|
||||
Simulate forward pass for one chunk through all layers.
|
||||
|
||||
Returns:
|
||||
output: Final layer attention output
|
||||
"""
|
||||
# Generate random Q, K, V for this chunk
|
||||
hidden = torch.randn(chunk_size, NUM_HEADS, HEAD_DIM, dtype=DTYPE, device=DEVICE)
|
||||
|
||||
# Build slot_mapping: maps token positions to GPU slots
|
||||
write_slot = manager.offload_engine.get_write_slot_for_prefill(chunk_idx)
|
||||
slot_mapping = torch.full((chunk_size,), write_slot * BLOCK_SIZE, dtype=torch.long, device=DEVICE)
|
||||
slot_mapping += torch.arange(chunk_size, device=DEVICE)
|
||||
|
||||
# Build cu_seqlens for flash attention
|
||||
cu_seqlens = torch.tensor([0, chunk_size], dtype=torch.int32, device=DEVICE)
|
||||
|
||||
# Set context for this chunk
|
||||
set_context(
|
||||
is_prefill=True,
|
||||
is_chunked_prefill=True,
|
||||
cu_seqlens_q=cu_seqlens,
|
||||
cu_seqlens_k=cu_seqlens,
|
||||
max_seqlen_q=chunk_size,
|
||||
max_seqlen_k=chunk_size,
|
||||
slot_mapping=slot_mapping,
|
||||
kvcache_manager=manager,
|
||||
chunked_seq=seq,
|
||||
current_chunk_idx=chunk_idx,
|
||||
)
|
||||
|
||||
# Forward through all layers
|
||||
output = hidden
|
||||
for layer in layers:
|
||||
k = torch.randn(chunk_size, NUM_KV_HEADS, HEAD_DIM, dtype=DTYPE, device=DEVICE)
|
||||
v = torch.randn(chunk_size, NUM_KV_HEADS, HEAD_DIM, dtype=DTYPE, device=DEVICE)
|
||||
output = layer.forward(output, k, v)
|
||||
|
||||
# Offload current chunk to CPU
|
||||
logical_id = seq.block_table[chunk_idx]
|
||||
cpu_block_id = manager.logical_blocks[logical_id].cpu_block_id
|
||||
manager.offload_engine.offload_slot_to_cpu(write_slot, cpu_block_id)
|
||||
manager.prefilled_blocks.add(logical_id)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Main Test
|
||||
# ============================================================
|
||||
|
||||
print("=" * 60)
|
||||
print("Test: Attention Layer with KV Cache Offload")
|
||||
print("=" * 60)
|
||||
|
||||
# 1. Setup
|
||||
print("\n[1] Creating manager and attention layers...")
|
||||
manager = create_manager()
|
||||
layers = create_attention_layers(manager)
|
||||
print(f" - Manager: {NUM_GPU_SLOTS} GPU slots, {NUM_CPU_BLOCKS} CPU blocks")
|
||||
print(f" - Layers: {NUM_LAYERS} layers, {NUM_HEADS} heads, {HEAD_DIM} head_dim")
|
||||
print(f" - OffloadEngine initialized: {manager.offload_engine is not None}")
|
||||
|
||||
# 2. Setup
|
||||
print("\n[2] Test configuration...")
|
||||
NUM_CHUNKS = NUM_CPU_BLOCKS # Use all CPU blocks
|
||||
print(f" - Total tokens: {NUM_CHUNKS * CHUNK_SIZE}")
|
||||
print(f" - Chunks: {NUM_CHUNKS}")
|
||||
|
||||
# 3. Warmup runs
|
||||
print(f"\n[3] Warmup runs (3 iterations)...")
|
||||
for warmup_iter in range(3):
|
||||
manager.prefilled_blocks.clear()
|
||||
seq = create_test_sequence(manager, num_chunks=NUM_CHUNKS)
|
||||
|
||||
for chunk_idx in range(NUM_CHUNKS):
|
||||
write_slot = manager.offload_engine.get_write_slot_for_prefill(chunk_idx)
|
||||
output = simulate_chunk_forward(layers, manager, seq, chunk_idx, CHUNK_SIZE)
|
||||
|
||||
manager.deallocate(seq)
|
||||
print(f" - Warmup {warmup_iter + 1}/3 completed")
|
||||
|
||||
# 4. Benchmark runs
|
||||
print(f"\n[4] Benchmark runs (10 iterations)...")
|
||||
for bench_iter in range(10):
|
||||
manager.prefilled_blocks.clear()
|
||||
seq = create_test_sequence(manager, num_chunks=NUM_CHUNKS)
|
||||
|
||||
for chunk_idx in range(NUM_CHUNKS):
|
||||
write_slot = manager.offload_engine.get_write_slot_for_prefill(chunk_idx)
|
||||
load_slots = manager.offload_engine.get_load_slots_for_prefill(write_slot)
|
||||
output = simulate_chunk_forward(layers, manager, seq, chunk_idx, CHUNK_SIZE)
|
||||
|
||||
manager.deallocate(seq)
|
||||
print(f" - Iteration {bench_iter + 1}/10 completed")
|
||||
|
||||
# 5. Verify results (using last iteration's seq)
|
||||
print("\n[5] Verifying ring buffer and offload...")
|
||||
for chunk_idx in range(NUM_CHUNKS):
|
||||
expected_slot = chunk_idx % NUM_GPU_SLOTS
|
||||
actual_slot = manager.offload_engine.get_write_slot_for_prefill(chunk_idx)
|
||||
assert actual_slot == expected_slot, f"Chunk {chunk_idx}: expected slot {expected_slot}, got {actual_slot}"
|
||||
|
||||
cpu_block_table = manager.get_prefilled_cpu_blocks(seq)
|
||||
assert cpu_block_table == seq.block_table[:NUM_CHUNKS], "CPU block table mismatch"
|
||||
print(" - Ring buffer cycling verified ✓")
|
||||
print(" - CPU offload verified ✓")
|
||||
|
||||
# Cleanup
|
||||
manager.deallocate(seq)
|
||||
|
||||
# Cleanup
|
||||
reset_context()
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("test_attention_offload: PASSED")
|
||||
print("=" * 60)
|
||||
@@ -6,7 +6,7 @@ Author: Zijie Tian
|
||||
|
||||
import torch
|
||||
import time
|
||||
from nanovllm.comm import memcpy_2d, memcpy_2d_async
|
||||
from nanovllm.comm import memcpy_2d
|
||||
|
||||
# ============================================================
|
||||
# Configuration
|
||||
@@ -34,64 +34,12 @@ class Config:
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Test 1: Async Transfer
|
||||
# ============================================================
|
||||
|
||||
def test_async_transfer():
|
||||
"""Test asynchronous transfer with CUDA stream."""
|
||||
print("\n[Test 1] Async Transfer Test")
|
||||
|
||||
cfg = Config()
|
||||
|
||||
# Create test data
|
||||
cpu_data = torch.randn(
|
||||
cfg.num_layers,
|
||||
cfg.num_blocks,
|
||||
cfg.features_per_block,
|
||||
dtype=cfg.dtype,
|
||||
pin_memory=True
|
||||
)
|
||||
gpu_buffer = torch.empty(
|
||||
cfg.num_layers,
|
||||
cfg.features_per_block,
|
||||
dtype=cfg.dtype,
|
||||
device='cuda'
|
||||
)
|
||||
|
||||
# Create CUDA stream
|
||||
stream = torch.cuda.Stream()
|
||||
|
||||
test_block_id = 5
|
||||
spitch = cfg.bytes_per_layer
|
||||
dpitch = cfg.bytes_per_block
|
||||
width = cfg.bytes_per_block
|
||||
height = cfg.num_layers
|
||||
|
||||
# Async transfer
|
||||
with torch.cuda.stream(stream):
|
||||
src_view = cpu_data[:, test_block_id, :]
|
||||
memcpy_2d_async(gpu_buffer, src_view, dpitch, spitch, width, height, "h2d", stream)
|
||||
|
||||
# Wait for completion
|
||||
stream.synchronize()
|
||||
|
||||
# Verify
|
||||
expected = cpu_data[:, test_block_id, :].cuda()
|
||||
if torch.allclose(gpu_buffer, expected, rtol=1e-3, atol=1e-3):
|
||||
print(" Result: PASSED ✓")
|
||||
return True
|
||||
else:
|
||||
print(" Result: FAILED ✗")
|
||||
return False
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Test 2: Performance Benchmark
|
||||
# Performance Benchmark
|
||||
# ============================================================
|
||||
|
||||
def benchmark_sgdma():
|
||||
"""Benchmark cudaMemcpy2D vs standard PyTorch methods."""
|
||||
print("\n[Test 2] Performance Benchmark")
|
||||
print("\n=== Performance Benchmark ===")
|
||||
|
||||
cfg = Config()
|
||||
|
||||
@@ -212,19 +160,17 @@ def benchmark_sgdma():
|
||||
# ============================================================
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("=== CUDA sgDMA (cudaMemcpy2D) Tests ===")
|
||||
print("=== CUDA sgDMA (cudaMemcpy2D) Benchmark ===")
|
||||
|
||||
# Check CUDA availability
|
||||
if not torch.cuda.is_available():
|
||||
print("CUDA not available. Skipping tests.")
|
||||
print("CUDA not available. Skipping benchmark.")
|
||||
exit(1)
|
||||
|
||||
# Print GPU info
|
||||
print(f"Using GPU: {torch.cuda.get_device_name()}")
|
||||
|
||||
# Run tests
|
||||
test1_passed = test_async_transfer()
|
||||
# Run benchmark
|
||||
benchmark_sgdma()
|
||||
|
||||
print("\n=== Tests Complete ===")
|
||||
print(f"All tests {'PASSED ✓' if test1_passed else 'FAILED ✗'}")
|
||||
print("\n=== Benchmark Complete ===")
|
||||
|
||||
Reference in New Issue
Block a user