[feat] Added chunked prefill and KV cache offload mechanism.

This commit is contained in:
Zijie Tian
2025-12-10 03:47:37 +08:00
parent 204fe2b38f
commit 0b6f19242d
25 changed files with 4414 additions and 61 deletions

1
tests/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Test suite for nano-vllm KV cache offload."""

169
tests/test_kernels.py Normal file
View File

@@ -0,0 +1,169 @@
"""Tests for Triton gathered copy kernels."""
import pytest
import torch
from nanovllm.kvcache.kernels import gathered_copy, gathered_copy_kv
class TestGatheredCopy:
"""Tests for gathered copy kernel."""
@pytest.fixture
def setup_tensors(self):
"""Create test tensors."""
torch.cuda.manual_seed(42)
num_src_blocks = 16
num_dst_blocks = 8
block_size = 256
kv_dim = 64
src = torch.randn(num_src_blocks, block_size, kv_dim,
dtype=torch.float16, device="cuda")
dst = torch.zeros(num_dst_blocks, block_size, kv_dim,
dtype=torch.float16, device="cuda")
# Indices: dst[i] = src[indices[i]]
indices = torch.randint(0, num_src_blocks, (num_dst_blocks,),
dtype=torch.int64, device="cuda")
return src, dst, indices
def test_basic_copy(self, setup_tensors):
"""Test basic gathered copy."""
src, dst, indices = setup_tensors
gathered_copy(src, dst, indices)
# Verify copy
for i in range(len(indices)):
src_idx = indices[i].item()
assert torch.allclose(dst[i], src[src_idx]), f"Mismatch at index {i}"
def test_skip_negative_indices(self, setup_tensors):
"""Test that negative indices are skipped."""
src, dst, indices = setup_tensors
# Set some indices to -1
indices[2] = -1
indices[5] = -1
# Fill dst with a known value
dst.fill_(999.0)
gathered_copy(src, dst, indices)
# Skipped slots should be unchanged
assert (dst[2] == 999.0).all()
assert (dst[5] == 999.0).all()
# Non-skipped slots should be copied
for i in [0, 1, 3, 4, 6, 7]:
src_idx = indices[i].item()
assert torch.allclose(dst[i], src[src_idx])
def test_single_block(self):
"""Test copying a single block."""
src = torch.randn(4, 256, 64, dtype=torch.float16, device="cuda")
dst = torch.zeros(1, 256, 64, dtype=torch.float16, device="cuda")
indices = torch.tensor([2], dtype=torch.int64, device="cuda")
gathered_copy(src, dst, indices)
assert torch.allclose(dst[0], src[2])
class TestGatheredCopyKV:
"""Tests for gathered K/V cache copy kernel."""
@pytest.fixture
def setup_kv_tensors(self):
"""Create K/V test tensors."""
torch.cuda.manual_seed(42)
num_src_blocks = 16
num_dst_blocks = 8
block_size = 256
num_kv_heads = 4
head_dim = 64
k_src = torch.randn(num_src_blocks, block_size, num_kv_heads, head_dim,
dtype=torch.float16, device="cuda")
v_src = torch.randn(num_src_blocks, block_size, num_kv_heads, head_dim,
dtype=torch.float16, device="cuda")
k_dst = torch.zeros(num_dst_blocks, block_size, num_kv_heads, head_dim,
dtype=torch.float16, device="cuda")
v_dst = torch.zeros(num_dst_blocks, block_size, num_kv_heads, head_dim,
dtype=torch.float16, device="cuda")
indices = torch.randint(0, num_src_blocks, (num_dst_blocks,),
dtype=torch.int64, device="cuda")
return k_src, v_src, k_dst, v_dst, indices
def test_kv_copy(self, setup_kv_tensors):
"""Test K/V gathered copy."""
k_src, v_src, k_dst, v_dst, indices = setup_kv_tensors
gathered_copy_kv(k_src, v_src, k_dst, v_dst, indices)
# Verify copy
for i in range(len(indices)):
src_idx = indices[i].item()
assert torch.allclose(k_dst[i], k_src[src_idx]), f"K mismatch at {i}"
assert torch.allclose(v_dst[i], v_src[src_idx]), f"V mismatch at {i}"
def test_kv_skip_negative(self, setup_kv_tensors):
"""Test that negative indices are skipped for K/V."""
k_src, v_src, k_dst, v_dst, indices = setup_kv_tensors
indices[0] = -1
k_dst.fill_(999.0)
v_dst.fill_(999.0)
gathered_copy_kv(k_src, v_src, k_dst, v_dst, indices)
assert (k_dst[0] == 999.0).all()
assert (v_dst[0] == 999.0).all()
class TestPerformance:
"""Performance benchmarks for gathered copy."""
@pytest.mark.parametrize("num_blocks", [8, 32, 128])
def test_throughput(self, num_blocks):
"""Benchmark copy throughput."""
block_size = 256
kv_dim = 64
src = torch.randn(num_blocks * 2, block_size, kv_dim,
dtype=torch.float16, device="cuda")
dst = torch.zeros(num_blocks, block_size, kv_dim,
dtype=torch.float16, device="cuda")
indices = torch.arange(num_blocks, dtype=torch.int64, device="cuda")
# Warmup
for _ in range(10):
gathered_copy(src, dst, indices)
torch.cuda.synchronize()
# Benchmark
import time
start = time.perf_counter()
num_iters = 100
for _ in range(num_iters):
gathered_copy(src, dst, indices)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start
bytes_copied = num_blocks * block_size * kv_dim * 2 * num_iters # fp16
bandwidth_gbps = bytes_copied / elapsed / 1e9
print(f"\n{num_blocks} blocks: {bandwidth_gbps:.2f} GB/s")
# Should achieve reasonable bandwidth (lower threshold for small blocks due to kernel launch overhead)
min_bandwidth = 5 if num_blocks <= 16 else 10
assert bandwidth_gbps > min_bandwidth, f"Bandwidth too low: {bandwidth_gbps} GB/s"
if __name__ == "__main__":
pytest.main([__file__, "-v"])
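
For readers of this diff, the semantics these kernel tests pin down (dst[i] = src[indices[i]], with negative indices skipped) can be summarized by a short pure-PyTorch reference. This is an illustrative equivalent written for this description, not the Triton kernel under test; both helper names below are hypothetical:

import torch

def gathered_copy_reference(src: torch.Tensor, dst: torch.Tensor, indices: torch.Tensor) -> None:
    """Reference semantics only (hypothetical helper, not part of the commit):
    dst[i] = src[indices[i]] for every i with indices[i] >= 0; slots whose
    index is negative are left untouched."""
    valid = indices >= 0
    dst[valid] = src[indices[valid]]

def gathered_copy_kv_reference(k_src, v_src, k_dst, v_dst, indices) -> None:
    """K and V follow the same gather; applying the reference twice matches
    what TestGatheredCopyKV asserts."""
    gathered_copy_reference(k_src, k_dst, indices)
    gathered_copy_reference(v_src, v_dst, indices)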

View File

@@ -0,0 +1,175 @@
"""Tests for KV cache managers."""
import pytest
import torch
from nanovllm.engine.sequence import Sequence
from nanovllm.kvcache.gpu_manager import GPUOnlyManager
class MockSequence:
"""Mock sequence for testing block allocation."""
def __init__(self, token_ids: list[int], block_size: int = 256):
self._token_ids = token_ids
self._block_size = block_size
self.block_table: list[int] = []
self.num_cached_tokens = 0
def __len__(self):
return len(self._token_ids)
@property
def num_blocks(self) -> int:
return (len(self) + self._block_size - 1) // self._block_size
def block(self, i: int) -> list[int]:
start = i * self._block_size
end = min((i + 1) * self._block_size, len(self))
return self._token_ids[start:end]
class TestGPUOnlyManager:
"""Tests for GPU-only KV cache manager."""
@pytest.fixture
def manager(self):
"""Create a small manager for testing."""
return GPUOnlyManager(num_blocks=16, block_size=256)
def test_initialization(self, manager):
"""Test manager initialization."""
assert manager.block_size == 256
assert manager.num_free_blocks == 16
assert len(manager.blocks) == 16
def test_allocate_cache(self, manager):
"""Test cache allocation."""
manager.allocate_cache(
num_layers=4,
num_kv_heads=8,
head_dim=64,
dtype=torch.float16,
)
assert manager.kv_cache is not None
assert manager.kv_cache.shape == (2, 4, 16, 256, 8, 64)
assert manager.kv_cache.device.type == "cuda"
def test_get_layer_cache(self, manager):
"""Test getting layer cache."""
manager.allocate_cache(
num_layers=4,
num_kv_heads=8,
head_dim=64,
dtype=torch.float16,
)
k_cache, v_cache = manager.get_layer_cache(0)
assert k_cache.shape == (16, 256, 8, 64)
assert v_cache.shape == (16, 256, 8, 64)
def test_can_allocate(self, manager):
"""Test allocation check."""
seq = MockSequence([0] * 300) # Needs 2 blocks
assert manager.can_allocate(seq)
# Fill up all blocks with unique tokens to avoid prefix caching
for i in range(8):
# Each sequence has unique tokens to prevent prefix cache hits
s = MockSequence([i * 1000 + j for j in range(300)])
manager.allocate(s)
# Now should not be able to allocate
new_seq = MockSequence([9999] * 300)
assert not manager.can_allocate(new_seq)
def test_allocate_and_deallocate(self, manager):
"""Test block allocation and deallocation."""
seq = MockSequence([0] * 600) # Needs 3 blocks
initial_free = manager.num_free_blocks
manager.allocate(seq)
assert len(seq.block_table) == 3
assert manager.num_free_blocks == initial_free - 3
manager.deallocate(seq)
assert len(seq.block_table) == 0
assert manager.num_free_blocks == initial_free
def test_can_append(self, manager):
"""Test append check."""
seq = MockSequence([0] * 256) # Exactly 1 block
manager.allocate(seq)
        # The 257th token spills into a new block, but free blocks remain, so append is allowed
seq._token_ids = [0] * 257
assert manager.can_append(seq)
def test_prepare_for_attention_noop(self, manager):
"""Test that prepare_for_attention is a no-op for GPU-only."""
seq = MockSequence([0] * 100)
manager.allocate(seq)
# Should not raise
manager.prepare_for_attention([seq], is_prefill=True)
manager.prepare_for_attention([seq], is_prefill=False)
def test_get_gpu_block_tables(self, manager):
"""Test getting GPU block tables."""
seq1 = MockSequence([0] * 300)
seq2 = MockSequence([0] * 600)
manager.allocate(seq1)
manager.allocate(seq2)
tables = manager.get_gpu_block_tables([seq1, seq2])
assert len(tables) == 2
assert tables[0] == list(seq1.block_table)
assert tables[1] == list(seq2.block_table)
class TestGPUOnlyManagerPrefixCaching:
"""Tests for prefix caching in GPU-only manager."""
@pytest.fixture
def manager(self):
"""Create manager for testing."""
return GPUOnlyManager(num_blocks=32, block_size=256)
def test_prefix_cache_hit(self, manager):
"""Test that identical prefixes are cached."""
# Create two sequences with same prefix
tokens = list(range(512)) # 2 full blocks
seq1 = MockSequence(tokens)
seq2 = MockSequence(tokens)
manager.allocate(seq1)
initial_free = manager.num_free_blocks
manager.allocate(seq2)
# Second sequence should reuse cached blocks
assert seq2.num_cached_tokens >= 256 # At least first block cached
# Should use fewer new blocks
assert manager.num_free_blocks >= initial_free - 2
def test_prefix_cache_different_suffix(self, manager):
"""Test cache with same prefix but different suffix."""
prefix = list(range(256)) # 1 full block
seq1 = MockSequence(prefix + [1000, 1001])
seq2 = MockSequence(prefix + [2000, 2001])
manager.allocate(seq1)
manager.allocate(seq2)
# First block should be shared
assert seq1.block_table[0] == seq2.block_table[0]
# Second block should be different
assert seq1.block_table[1] != seq2.block_table[1]
if __name__ == "__main__":
pytest.main([__file__, "-v"])
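
The prefix-caching assertions above (shared leading blocks, num_cached_tokens counting reused full blocks) and the "unique tokens to prevent prefix cache hits" comments suggest a cache keyed by a chained hash over full blocks of token ids, so that a block's key depends on its entire prefix. A rough sketch of that idea; hash_to_block_id and take_free_block are hypothetical names used only for illustration, not the actual GPUOnlyManager internals:

import hashlib

def chained_block_hash(prev_hash: int, block_tokens: list[int]) -> int:
    # A full block's key covers its own tokens plus the hash of everything
    # before it, so two sequences can share a cached block only when their
    # entire prefixes match.
    m = hashlib.sha256(str(prev_hash).encode())
    m.update(",".join(map(str, block_tokens)).encode())
    return int.from_bytes(m.digest()[:8], "little")

def allocate_with_prefix_cache(manager, seq):
    # Illustrative allocation path; only full blocks are eligible for caching.
    h = 0
    for i in range(seq.num_blocks):
        tokens = seq.block(i)
        full = len(tokens) == manager.block_size
        h = chained_block_hash(h, tokens) if full else -1
        cached = manager.hash_to_block_id.get(h) if full else None  # hypothetical map
        if cached is not None:
            seq.num_cached_tokens += manager.block_size
            seq.block_table.append(cached)
        else:
            seq.block_table.append(manager.take_free_block(h))      # hypothetical helper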

View File

@@ -0,0 +1,196 @@
"""Tests for CPU-GPU offload engine."""
import pytest
import torch
from nanovllm.kvcache.offload_engine import OffloadEngine
class TestOffloadEngine:
"""Tests for OffloadEngine."""
@pytest.fixture
def engine(self):
"""Create a small engine for testing."""
return OffloadEngine(
num_layers=2,
num_gpu_blocks=4,
num_cpu_blocks=8,
block_size=256,
num_kv_heads=4,
head_dim=64,
dtype=torch.float16,
num_streams=2,
)
def test_initialization(self, engine):
"""Test engine initialization."""
# Check GPU cache shape
assert engine.k_cache_gpu.shape == (2, 4, 256, 4, 64)
assert engine.v_cache_gpu.shape == (2, 4, 256, 4, 64)
# Check CPU cache shape
assert engine.k_cache_cpu.shape == (2, 8, 256, 4, 64)
assert engine.v_cache_cpu.shape == (2, 8, 256, 4, 64)
# Check pinned memory
assert engine.k_cache_cpu.is_pinned()
assert engine.v_cache_cpu.is_pinned()
# Check gather indices
assert engine.gather_indices_cpu.shape == (2, 4)
assert engine.gather_indices_gpu.shape == (2, 4)
def test_get_layer_cache(self, engine):
"""Test getting layer cache."""
k, v = engine.get_layer_cache(0)
assert k.shape == (4, 256, 4, 64)
assert v.shape == (4, 256, 4, 64)
assert k.device.type == "cuda"
assert v.device.type == "cuda"
def test_prefetch_and_offload(self, engine):
"""Test async prefetch and offload."""
# Write some data to CPU block 0
engine.k_cache_cpu[0, 0].fill_(1.0)
engine.v_cache_cpu[0, 0].fill_(2.0)
# Prefetch to GPU block 2
event = engine.prefetch_block_async(
layer_id=0,
cpu_block_id=0,
gpu_block_id=2,
)
event.synchronize()
# Verify data was copied (move GPU to CPU for comparison)
assert torch.allclose(engine.k_cache_gpu[0, 2].cpu(), engine.k_cache_cpu[0, 0])
assert torch.allclose(engine.v_cache_gpu[0, 2].cpu(), engine.v_cache_cpu[0, 0])
# Modify GPU data
engine.k_cache_gpu[0, 2].fill_(3.0)
engine.v_cache_gpu[0, 2].fill_(4.0)
# Offload to CPU block 5
event = engine.offload_block_async(
layer_id=0,
gpu_block_id=2,
cpu_block_id=5,
)
event.synchronize()
# Verify data was copied
assert torch.allclose(engine.k_cache_cpu[0, 5], engine.k_cache_gpu[0, 2].cpu())
assert torch.allclose(engine.v_cache_cpu[0, 5], engine.v_cache_gpu[0, 2].cpu())
def test_update_gather_indices(self, engine):
"""Test updating gather indices."""
# Manually set CPU data
for i in range(8):
engine.k_cache_cpu[0, i].fill_(float(i))
engine.v_cache_cpu[0, i].fill_(float(i + 100))
# Update indices for layer 0: (cpu_block_id, gpu_slot)
mappings = [(2, 0), (5, 1), (1, 2), (7, 3)]
engine.update_gather_indices(layer_id=0, mappings=mappings)
torch.cuda.synchronize()
# Verify indices were set
expected = torch.tensor([2, 5, 1, 7], dtype=torch.int64)
assert torch.equal(engine.gather_indices_cpu[0], expected)
def test_gathered_h2d_layer(self, engine):
"""Test gathered H2D copy for a layer."""
# Set up CPU data with known values
for i in range(8):
engine.k_cache_cpu[0, i].fill_(float(i))
engine.v_cache_cpu[0, i].fill_(float(i + 100))
# Set gather indices: (cpu_block_id, gpu_slot)
# GPU slot 0 gets CPU block 3, GPU slot 1 gets CPU block 0, etc.
mappings = [(3, 0), (0, 1), (7, 2), (2, 3)]
engine.update_gather_indices(layer_id=0, mappings=mappings)
torch.cuda.synchronize()
# Execute gathered H2D
engine.gathered_h2d_layer(layer_id=0)
torch.cuda.synchronize()
# Verify: GPU slot 0 should have CPU block 3's data
assert torch.allclose(engine.k_cache_gpu[0, 0],
torch.full_like(engine.k_cache_gpu[0, 0], 3.0))
# GPU slot 1 should have CPU block 0's data
assert torch.allclose(engine.k_cache_gpu[0, 1],
torch.full_like(engine.k_cache_gpu[0, 1], 0.0))
# GPU slot 2 should have CPU block 7's data
assert torch.allclose(engine.k_cache_gpu[0, 2],
torch.full_like(engine.k_cache_gpu[0, 2], 7.0))
# GPU slot 3 should have CPU block 2's data
assert torch.allclose(engine.k_cache_gpu[0, 3],
torch.full_like(engine.k_cache_gpu[0, 3], 2.0))
def test_multi_layer_independence(self, engine):
"""Test that layers are independent."""
# Set different data for each layer
engine.k_cache_cpu[0, 0].fill_(1.0)
engine.k_cache_cpu[1, 0].fill_(2.0)
# Prefetch layer 0
event = engine.prefetch_block_async(0, 0, 0)
event.synchronize()
# Verify only layer 0 was affected
assert torch.allclose(engine.k_cache_gpu[0, 0],
torch.full_like(engine.k_cache_gpu[0, 0], 1.0))
# Layer 1 should be zeros (initial state)
assert not torch.allclose(engine.k_cache_gpu[1, 0],
torch.full_like(engine.k_cache_gpu[1, 0], 2.0))
class TestOffloadEngineFixedAddresses:
"""Tests verifying fixed address property for CUDA Graph compatibility."""
@pytest.fixture
def engine(self):
"""Create engine for address tests."""
return OffloadEngine(
num_layers=2,
num_gpu_blocks=4,
num_cpu_blocks=8,
block_size=256,
num_kv_heads=4,
head_dim=64,
dtype=torch.float16,
num_streams=2,
)
def test_gpu_cache_address_fixed(self, engine):
"""Verify GPU cache addresses don't change."""
k_ptr_before = engine.k_cache_gpu.data_ptr()
v_ptr_before = engine.v_cache_gpu.data_ptr()
# Perform some operations - mappings is List[(cpu_block_id, gpu_slot)]
mappings = [(0, 0), (1, 1), (2, 2), (3, 3)]
engine.update_gather_indices(0, mappings)
engine.gathered_h2d_layer(0)
torch.cuda.synchronize()
# Addresses should be the same
assert engine.k_cache_gpu.data_ptr() == k_ptr_before
assert engine.v_cache_gpu.data_ptr() == v_ptr_before
def test_gather_indices_gpu_address_fixed(self, engine):
"""Verify gather indices GPU tensor address doesn't change."""
ptr_before = engine.gather_indices_gpu.data_ptr()
# Update indices multiple times - mappings is List[(cpu_block_id, gpu_slot)]
mappings = [(0, 0), (1, 1), (2, 2), (3, 3)]
for _ in range(10):
engine.update_gather_indices(0, mappings)
torch.cuda.synchronize()
assert engine.gather_indices_gpu.data_ptr() == ptr_before
if __name__ == "__main__":
pytest.main([__file__, "-v"])
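
Read together, the engine tests describe a per-layer staging flow: declare which CPU block should land in which GPU slot, launch the gathered H2D copy, then hand the fixed-address GPU views to attention. A hypothetical driver loop using only the calls exercised above (CUDA required, as in the tests):

import torch
from nanovllm.kvcache.offload_engine import OffloadEngine

engine = OffloadEngine(num_layers=2, num_gpu_blocks=4, num_cpu_blocks=8,
                       block_size=256, num_kv_heads=4, head_dim=64,
                       dtype=torch.float16, num_streams=2)

for layer_id in range(2):
    # (cpu_block_id, gpu_slot) pairs: which resident CPU block fills which GPU slot
    mappings = [(3, 0), (0, 1), (7, 2), (2, 3)]
    engine.update_gather_indices(layer_id, mappings)
    engine.gathered_h2d_layer(layer_id)
    # Fixed-address per-layer views, safe to reuse across steps (CUDA Graph friendly)
    k_cache, v_cache = engine.get_layer_cache(layer_id)
torch.cuda.synchronize()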

167
tests/test_policies.py Normal file
View File

@@ -0,0 +1,167 @@
"""Tests for eviction policies."""
import pytest
from nanovllm.kvcache.policies.lru_policy import LRUPolicy
from nanovllm.kvcache.policies.fifo_policy import FIFOPolicy
from nanovllm.kvcache.policies import get_policy
class TestLRUPolicy:
"""Tests for LRU eviction policy."""
def test_basic_eviction(self):
"""Test that LRU evicts least recently used block."""
policy = LRUPolicy()
# Allocate blocks 0, 1, 2 in order
policy.on_block_allocated(0, step=1)
policy.on_block_allocated(1, step=2)
policy.on_block_allocated(2, step=3)
# Access block 0 (makes it most recently used)
policy.on_block_access(0, step=4)
# Should evict block 1 (least recently used)
candidates = {0, 1, 2}
victim = policy.select_victim(candidates)
assert victim == 1, f"Expected block 1, got {victim}"
def test_access_updates_order(self):
"""Test that access updates LRU order."""
policy = LRUPolicy()
policy.on_block_allocated(0, step=1)
policy.on_block_allocated(1, step=2)
policy.on_block_allocated(2, step=3)
# Access all in reverse order
policy.on_block_access(2, step=4)
policy.on_block_access(1, step=5)
policy.on_block_access(0, step=6)
        # Block 2 is now the least recently used (it was accessed earliest of the three)
candidates = {0, 1, 2}
victim = policy.select_victim(candidates)
assert victim == 2, f"Expected block 2, got {victim}"
def test_eviction_removes_from_tracking(self):
"""Test that evicted blocks are removed from tracking."""
policy = LRUPolicy()
policy.on_block_allocated(0, step=1)
policy.on_block_allocated(1, step=2)
policy.on_block_evicted(0)
# Only block 1 should be a candidate
candidates = {0, 1}
victim = policy.select_victim(candidates)
assert victim == 1, "Should select block 1 since 0 was evicted"
def test_batch_eviction_order(self):
"""Test get_eviction_order returns blocks in LRU order."""
policy = LRUPolicy()
for i in range(5):
policy.on_block_allocated(i, step=i)
# Access blocks 2 and 4
policy.on_block_access(2, step=10)
policy.on_block_access(4, step=11)
candidates = {0, 1, 2, 3, 4}
order = policy.get_eviction_order(candidates, count=3)
# Should be 0, 1, 3 (in that order, skipping 2 and 4 until needed)
assert order == [0, 1, 3], f"Expected [0, 1, 3], got {order}"
class TestFIFOPolicy:
"""Tests for FIFO eviction policy."""
def test_basic_eviction(self):
"""Test that FIFO evicts oldest allocated block."""
policy = FIFOPolicy()
policy.on_block_allocated(0, step=1)
policy.on_block_allocated(1, step=2)
policy.on_block_allocated(2, step=3)
# Access doesn't change FIFO order
policy.on_block_access(0, step=4)
candidates = {0, 1, 2}
victim = policy.select_victim(candidates)
assert victim == 0, f"Expected block 0 (oldest), got {victim}"
def test_access_does_not_update_order(self):
"""Test that FIFO ignores access patterns."""
policy = FIFOPolicy()
policy.on_block_allocated(0, step=1)
policy.on_block_allocated(1, step=2)
policy.on_block_allocated(2, step=3)
# Multiple accesses to block 0
for i in range(10):
policy.on_block_access(0, step=10 + i)
# Block 0 should still be evicted first (FIFO order)
candidates = {0, 1, 2}
victim = policy.select_victim(candidates)
assert victim == 0, f"Expected block 0, got {victim}"
def test_prefetch_resets_order(self):
"""Test that prefetch moves block to end of queue."""
policy = FIFOPolicy()
policy.on_block_allocated(0, step=1)
policy.on_block_allocated(1, step=2)
policy.on_block_allocated(2, step=3)
# Prefetch block 0 (moves to end)
policy.on_block_prefetched(0, step=4)
candidates = {0, 1, 2}
victim = policy.select_victim(candidates)
assert victim == 1, f"Expected block 1 (now oldest), got {victim}"
def test_batch_eviction_order(self):
"""Test get_eviction_order returns blocks in FIFO order."""
policy = FIFOPolicy()
for i in range(5):
policy.on_block_allocated(i, step=i)
candidates = {0, 1, 2, 3, 4}
order = policy.get_eviction_order(candidates, count=3)
assert order == [0, 1, 2], f"Expected [0, 1, 2], got {order}"
class TestGetPolicy:
"""Tests for policy factory function."""
def test_get_lru(self):
"""Test getting LRU policy by name."""
policy = get_policy("lru")
assert isinstance(policy, LRUPolicy)
def test_get_fifo(self):
"""Test getting FIFO policy by name."""
policy = get_policy("fifo")
assert isinstance(policy, FIFOPolicy)
def test_get_by_class_path(self):
"""Test getting policy by full class path."""
policy = get_policy("nanovllm.kvcache.policies.lru_policy.LRUPolicy")
assert isinstance(policy, LRUPolicy)
def test_invalid_policy_name(self):
"""Test that invalid policy name raises error."""
with pytest.raises((ValueError, ImportError)):
get_policy("invalid_policy")
if __name__ == "__main__":
pytest.main([__file__, "-v"])
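
As a reading aid, the behaviour TestLRUPolicy expects can be satisfied by a minimal OrderedDict-based policy. This sketch mirrors the tested interface but is not the LRUPolicy shipped in the commit:

from collections import OrderedDict

class MinimalLRUPolicy:
    """Minimal sketch matching the interface exercised above; illustrative only."""
    def __init__(self):
        self._order = OrderedDict()  # block_id -> None, oldest entry first

    def on_block_allocated(self, block_id: int, step: int) -> None:
        self._order[block_id] = None
        self._order.move_to_end(block_id)

    def on_block_access(self, block_id: int, step: int) -> None:
        if block_id in self._order:
            self._order.move_to_end(block_id)  # now most recently used

    def on_block_evicted(self, block_id: int) -> None:
        self._order.pop(block_id, None)

    def select_victim(self, candidates: set[int]) -> int:
        # Oldest tracked block that is also a candidate; fall back to any candidate.
        for block_id in self._order:
            if block_id in candidates:
                return block_id
        return next(iter(candidates))

    def get_eviction_order(self, candidates: set[int], count: int) -> list[int]:
        return [b for b in self._order if b in candidates][:count]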