From 18bc433f09b72c6a45d737e1c2a89b38bd72480d Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Tue, 27 Jan 2026 03:42:05 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20perf:=20improve=20NVTX=20profiling?= =?UTF-8?q?=20with=20colored=20ranges=20and=20configurable=20slots?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Switch from torch.cuda.nvtx to nvtx package for colored range support - Add color coding: blue for H2D, green for D2H decode, orange for D2H prefill - Add --num-gpu-blocks parameter to profile_offload.sh - Include slot count in output filename for easier comparison Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy --- nanovllm/kvcache/offload_engine.py | 16 ++++++++++------ scripts/profile_offload.sh | 12 +++++++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/nanovllm/kvcache/offload_engine.py b/nanovllm/kvcache/offload_engine.py index e860daf..bcd832d 100644 --- a/nanovllm/kvcache/offload_engine.py +++ b/nanovllm/kvcache/offload_engine.py @@ -9,6 +9,7 @@ Key design principles for CUDA Graph compatibility: import torch import torch.cuda.nvtx +import nvtx from torch import Tensor from typing import Dict, List, Tuple, Optional from dataclasses import dataclass @@ -403,7 +404,8 @@ class OffloadEngine: nvtx_label = f"H2D: L{layer_id} Chunk{chunk_idx} CPU[{cpu_block_id}]->Slot[{slot_idx}]" else: nvtx_label = f"H2D: L{layer_id} CPU[{cpu_block_id}]->Slot[{slot_idx}]" - torch.cuda.nvtx.range_push(nvtx_label) + + nvtx.push_range(message=nvtx_label, color="blue") with torch.cuda.stream(stream): # Wait for previous compute on this slot to complete before overwriting # This prevents data race: transfer must not start until attention finishes reading @@ -421,7 +423,7 @@ class OffloadEngine: self.v_cache_cpu[layer_id, cpu_block_id], non_blocking=True ) self.ring_slot_ready[slot_idx].record(stream) - torch.cuda.nvtx.range_pop() + nvtx.pop_range() def wait_slot_layer(self, slot_idx: int) -> None: """ @@ -478,7 +480,8 @@ class OffloadEngine: else: self.sparse_policy.on_decode_offload(cpu_block_id, layer_id, k_cache, valid_tokens) - torch.cuda.nvtx.range_push(f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]") + nvtx_label = f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]" + nvtx.push_range(message=nvtx_label, color="green") with torch.cuda.stream(self.transfer_stream_main): # Wait for both compute_stream and default stream # - compute_stream: for flash attention operations @@ -494,7 +497,7 @@ class OffloadEngine: self.v_cache_gpu[slot_idx], non_blocking=True ) self.ring_slot_offload_done[slot_idx].record(self.transfer_stream_main) - torch.cuda.nvtx.range_pop() + nvtx.pop_range() # ----- KV access methods for ring buffer ----- @@ -792,7 +795,8 @@ class OffloadEngine: # Use per-layer stream for parallel offloads stream = self.prefill_offload_streams[layer_id] - torch.cuda.nvtx.range_push(f"AsyncPrefillOffload: L{layer_id}->CPU[{cpu_block_id}]") + nvtx_label = f"D2H: PrefillBuffer L{layer_id}->CPU[{cpu_block_id}]" + nvtx.push_range(message=nvtx_label, color="orange") with torch.cuda.stream(stream): # Wait for compute to finish writing to prefill buffer stream.wait_stream(self.compute_stream) @@ -807,7 +811,7 @@ class OffloadEngine: # Record completion event self.prefill_offload_events[layer_id].record(stream) - torch.cuda.nvtx.range_pop() + nvtx.pop_range() def wait_all_prefill_offloads(self) -> None: """Wait for all prefill buffer offloads to complete.""" diff --git a/scripts/profile_offload.sh b/scripts/profile_offload.sh index 4db42e0..86d7d80 100755 --- a/scripts/profile_offload.sh +++ b/scripts/profile_offload.sh @@ -9,6 +9,7 @@ # --dataset DATASET Task name (default: niah_single_1) # --sample INDEX Sample index (default: 0) # --gpu GPU_ID GPU to use (default: 0) +# --num-gpu-blocks N Number of GPU blocks/slots (default: 4) # --no-offload Disable CPU offload # # Output: @@ -18,6 +19,7 @@ # bash scripts/profile_offload.sh # bash scripts/profile_offload.sh --dataset niah_single_1 --sample 5 # bash scripts/profile_offload.sh --gpu 1 --no-offload +# bash scripts/profile_offload.sh --num-gpu-blocks 8 set -e @@ -25,6 +27,7 @@ set -e DATASET="niah_single_1" SAMPLE_INDEX="0" GPU_ID="0" +NUM_GPU_BLOCKS="4" ENABLE_OFFLOAD="--enable-offload" # Parse arguments @@ -46,6 +49,10 @@ while [[ $# -gt 0 ]]; do ENABLE_OFFLOAD="" shift ;; + --num-gpu-blocks) + NUM_GPU_BLOCKS="$2" + shift 2 + ;; -h|--help) echo "Usage: $0 [options]" echo "" @@ -54,6 +61,7 @@ while [[ $# -gt 0 ]]; do echo " --sample INDEX Sample index (default: 0)" echo " --gpu GPU_ID GPU to use (default: 0)" echo " --no-offload Disable CPU offload" + echo " --num-gpu-blocks N Number of GPU blocks/slots (default: 4)" exit 0 ;; *) @@ -76,7 +84,7 @@ mkdir -p "$OUTPUT_DIR" TIMESTAMP=$(date +%Y%m%d_%H%M%S) OFFLOAD_SUFFIX="" if [ -n "$ENABLE_OFFLOAD" ]; then - OFFLOAD_SUFFIX="_offload" + OFFLOAD_SUFFIX="_offload_${NUM_GPU_BLOCKS}slots" fi OUTPUT_FILE="$OUTPUT_DIR/ruler_${DATASET}_sample${SAMPLE_INDEX}${OFFLOAD_SUFFIX}_${TIMESTAMP}" @@ -87,6 +95,7 @@ echo "Test script: $TEST_SCRIPT" echo "Dataset: $DATASET" echo "Sample: $SAMPLE_INDEX" echo "GPU: $GPU_ID" +echo "GPU Blocks: $NUM_GPU_BLOCKS" echo "Offload: ${ENABLE_OFFLOAD:-disabled}" echo "Output file: $OUTPUT_FILE.nsys-rep" echo "" @@ -109,6 +118,7 @@ nsys profile \ python "$TEST_SCRIPT" \ --datasets "$DATASET" \ --sample-indices "$SAMPLE_INDEX" \ + --num-gpu-blocks "$NUM_GPU_BLOCKS" \ $ENABLE_OFFLOAD \ --quiet