From 18bc433f09b72c6a45d737e1c2a89b38bd72480d Mon Sep 17 00:00:00 2001
From: Zijie Tian <zijietian@mail.xmu.edu.cn>
Date: Tue, 27 Jan 2026 03:42:05 +0800
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20perf:=20improve=20NVTX=20profiling?=
 =?UTF-8?q?=20with=20colored=20ranges=20and=20configurable=20slots?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Switch from torch.cuda.nvtx to nvtx package for colored range support
- Add color coding: blue for H2D, green for D2H decode, orange for D2H prefill
  (prefill range label renamed from "AsyncPrefillOffload: ..." to
  "D2H: PrefillBuffer ...")
- Add --num-gpu-blocks parameter to profile_offload.sh
- Include slot count in the output filename of offload runs for easier
  comparison

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
---
 nanovllm/kvcache/offload_engine.py | 16 ++++++++++------
 scripts/profile_offload.sh         | 12 +++++++++++-
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/nanovllm/kvcache/offload_engine.py b/nanovllm/kvcache/offload_engine.py
index e860daf..bcd832d 100644
--- a/nanovllm/kvcache/offload_engine.py
+++ b/nanovllm/kvcache/offload_engine.py
@@ -9,6 +9,7 @@ Key design principles for CUDA Graph compatibility:
 
 import torch
 import torch.cuda.nvtx
+import nvtx
 from torch import Tensor
 from typing import Dict, List, Tuple, Optional
 from dataclasses import dataclass
@@ -403,7 +404,8 @@ class OffloadEngine:
             nvtx_label = f"H2D: L{layer_id} Chunk{chunk_idx} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
         else:
             nvtx_label = f"H2D: L{layer_id} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
-        torch.cuda.nvtx.range_push(nvtx_label)
+
+        nvtx.push_range(message=nvtx_label, color="blue")
         with torch.cuda.stream(stream):
             # Wait for previous compute on this slot to complete before overwriting
             # This prevents data race: transfer must not start until attention finishes reading
@@ -421,7 +423,7 @@ class OffloadEngine:
                 self.v_cache_cpu[layer_id, cpu_block_id], non_blocking=True
             )
             self.ring_slot_ready[slot_idx].record(stream)
-        torch.cuda.nvtx.range_pop()
+        nvtx.pop_range()
 
     def wait_slot_layer(self, slot_idx: int) -> None:
         """
@@ -478,7 +480,8 @@ class OffloadEngine:
             else:
                 self.sparse_policy.on_decode_offload(cpu_block_id, layer_id, k_cache, valid_tokens)
 
-        torch.cuda.nvtx.range_push(f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]")
+        nvtx_label = f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]"
+        nvtx.push_range(message=nvtx_label, color="green")
         with torch.cuda.stream(self.transfer_stream_main):
             # Wait for both compute_stream and default stream
             # - compute_stream: for flash attention operations
@@ -494,7 +497,7 @@ class OffloadEngine:
                 self.v_cache_gpu[slot_idx], non_blocking=True
             )
             self.ring_slot_offload_done[slot_idx].record(self.transfer_stream_main)
-        torch.cuda.nvtx.range_pop()
+        nvtx.pop_range()
 
     # ----- KV access methods for ring buffer -----
 
@@ -792,7 +795,8 @@ class OffloadEngine:
         # Use per-layer stream for parallel offloads
         stream = self.prefill_offload_streams[layer_id]
 
-        torch.cuda.nvtx.range_push(f"AsyncPrefillOffload: L{layer_id}->CPU[{cpu_block_id}]")
+        nvtx_label = f"D2H: PrefillBuffer L{layer_id}->CPU[{cpu_block_id}]"
+        nvtx.push_range(message=nvtx_label, color="orange")
         with torch.cuda.stream(stream):
             # Wait for compute to finish writing to prefill buffer
             stream.wait_stream(self.compute_stream)
@@ -807,7 +811,7 @@ class OffloadEngine:
 
             # Record completion event
             self.prefill_offload_events[layer_id].record(stream)
-        torch.cuda.nvtx.range_pop()
+        nvtx.pop_range()
 
     def wait_all_prefill_offloads(self) -> None:
         """Wait for all prefill buffer offloads to complete."""
diff --git a/scripts/profile_offload.sh b/scripts/profile_offload.sh
index 4db42e0..86d7d80 100755
--- a/scripts/profile_offload.sh
+++ b/scripts/profile_offload.sh
@@ -9,6 +9,7 @@
 #   --dataset DATASET    Task name (default: niah_single_1)
 #   --sample INDEX       Sample index (default: 0)
 #   --gpu GPU_ID         GPU to use (default: 0)
+#   --num-gpu-blocks N   Number of GPU blocks/slots (default: 4)
 #   --no-offload         Disable CPU offload
 #
 # Output:
@@ -18,6 +19,7 @@
 #   bash scripts/profile_offload.sh
 #   bash scripts/profile_offload.sh --dataset niah_single_1 --sample 5
 #   bash scripts/profile_offload.sh --gpu 1 --no-offload
+#   bash scripts/profile_offload.sh --num-gpu-blocks 8
 
 set -e
 
@@ -25,6 +27,7 @@ set -e
 DATASET="niah_single_1"
 SAMPLE_INDEX="0"
 GPU_ID="0"
+NUM_GPU_BLOCKS="4"
 ENABLE_OFFLOAD="--enable-offload"
 
 # Parse arguments
@@ -46,6 +49,10 @@ while [[ $# -gt 0 ]]; do
             ENABLE_OFFLOAD=""
             shift
             ;;
+        --num-gpu-blocks)
+            NUM_GPU_BLOCKS="$2"
+            shift 2
+            ;;
         -h|--help)
             echo "Usage: $0 [options]"
             echo ""
@@ -54,6 +61,7 @@ while [[ $# -gt 0 ]]; do
             echo "  --sample INDEX       Sample index (default: 0)"
             echo "  --gpu GPU_ID         GPU to use (default: 0)"
             echo "  --no-offload         Disable CPU offload"
+            echo "  --num-gpu-blocks N   Number of GPU blocks/slots (default: 4)"
             exit 0
             ;;
         *)
@@ -76,7 +84,7 @@ mkdir -p "$OUTPUT_DIR"
 TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 OFFLOAD_SUFFIX=""
 if [ -n "$ENABLE_OFFLOAD" ]; then
-    OFFLOAD_SUFFIX="_offload"
+    OFFLOAD_SUFFIX="_offload_${NUM_GPU_BLOCKS}slots"
 fi
 OUTPUT_FILE="$OUTPUT_DIR/ruler_${DATASET}_sample${SAMPLE_INDEX}${OFFLOAD_SUFFIX}_${TIMESTAMP}"
 
@@ -87,6 +95,7 @@ echo "Test script: $TEST_SCRIPT"
 echo "Dataset:     $DATASET"
 echo "Sample:      $SAMPLE_INDEX"
 echo "GPU:         $GPU_ID"
+echo "GPU Blocks:  $NUM_GPU_BLOCKS"
 echo "Offload:     ${ENABLE_OFFLOAD:-disabled}"
 echo "Output file: $OUTPUT_FILE.nsys-rep"
 echo ""
@@ -109,6 +118,7 @@ nsys profile \
     python "$TEST_SCRIPT" \
         --datasets "$DATASET" \
         --sample-indices "$SAMPLE_INDEX" \
+        --num-gpu-blocks "$NUM_GPU_BLOCKS" \
         $ENABLE_OFFLOAD \
         --quiet