[WIP] Before-fix needle.

This commit is contained in:
Zijie Tian
2025-12-31 23:35:25 +08:00
parent ccd1b3d4ab
commit 30462fe89a
5 changed files with 212 additions and 290 deletions

View File

@@ -29,10 +29,7 @@ def debug_load_hook(slot_idx: int, layer_id: int, cpu_block_id: int, k: Tensor,
"""Record loaded tensor values for layer 0."""
if layer_id != 0:
return
if layer_id == 0:
__import__('pdb').set_trace()
load_log.append({
"chunk_idx": current_chunk[0],
"cpu_block_id": cpu_block_id,

View File

@@ -20,7 +20,6 @@ import torch
from random import randint, seed
from nanovllm import LLM, SamplingParams
from nanovllm.utils.context import get_context
from nanovllm.kvcache.debug_utils import dump_block_state
# ============================================================
@@ -97,9 +96,9 @@ def make_verified_load_to_slot_layer(original_func, offload_engine):
# cpu_block_id == chunk_idx in our sequential test
expected_k, expected_v = get_expected_pattern(cpu_block_id)
# Read GPU slot data
gpu_k = offload_engine.k_cache_gpu[layer_id, slot_idx]
gpu_v = offload_engine.v_cache_gpu[layer_id, slot_idx]
# Read GPU slot data (GPU cache has no layer dimension)
gpu_k = offload_engine.k_cache_gpu[slot_idx]
gpu_v = offload_engine.v_cache_gpu[slot_idx]
actual_k = gpu_k.float().mean().item()
actual_v = gpu_v.float().mean().item()
@@ -306,9 +305,9 @@ def make_gpu_write_verification_post_hook(layer_id: int):
# Get expected pattern for current chunk
expected_k, expected_v = get_expected_pattern(chunk_idx)
# Verify write_slot contains current chunk's data
gpu_k = oe.k_cache_gpu[layer_id, write_slot]
gpu_v = oe.v_cache_gpu[layer_id, write_slot]
# Verify write_slot contains current chunk's data (GPU cache has no layer dimension)
gpu_k = oe.k_cache_gpu[write_slot]
gpu_v = oe.v_cache_gpu[write_slot]
actual_k_mean = gpu_k.float().mean().item()
actual_v_mean = gpu_v.float().mean().item()
@@ -419,9 +418,9 @@ def make_post_chunk_verification_hook(layer_id: int):
expected_k, expected_v = get_expected_pattern(chunk_idx)
# Check GPU ring buffer
gpu_k = oe.k_cache_gpu[layer_id, ring_slot]
gpu_v = oe.v_cache_gpu[layer_id, ring_slot]
# Check GPU ring buffer (GPU cache has no layer dimension)
gpu_k = oe.k_cache_gpu[ring_slot]
gpu_v = oe.v_cache_gpu[ring_slot]
k_ok, k_err = check_pattern(gpu_k, expected_k, f"GPU K slot {ring_slot}")
v_ok, v_err = check_pattern(gpu_v, expected_v, f"GPU V slot {ring_slot}")