perf: improve NVTX profiling with colored ranges and configurable slots

- Switch from torch.cuda.nvtx to nvtx package for colored range support
- Add color coding: blue for H2D, green for D2H decode, orange for D2H prefill
- Add --num-gpu-blocks parameter to profile_offload.sh
- Include slot count in output filename for easier comparison

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
Zijie Tian
2026-01-27 03:42:05 +08:00
parent aea3812230
commit 18bc433f09
2 changed files with 21 additions and 7 deletions

View File

@@ -9,6 +9,7 @@ Key design principles for CUDA Graph compatibility:
 import torch
 import torch.cuda.nvtx
+import nvtx
 from torch import Tensor
 from typing import Dict, List, Tuple, Optional
 from dataclasses import dataclass
@@ -403,7 +404,8 @@ class OffloadEngine:
 nvtx_label = f"H2D: L{layer_id} Chunk{chunk_idx} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
 else:
 nvtx_label = f"H2D: L{layer_id} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
-torch.cuda.nvtx.range_push(nvtx_label)
+nvtx.push_range(message=nvtx_label, color="blue")
 with torch.cuda.stream(stream):
 # Wait for previous compute on this slot to complete before overwriting
 # This prevents data race: transfer must not start until attention finishes reading
@@ -421,7 +423,7 @@ class OffloadEngine:
 self.v_cache_cpu[layer_id, cpu_block_id], non_blocking=True
 )
 self.ring_slot_ready[slot_idx].record(stream)
-torch.cuda.nvtx.range_pop()
+nvtx.pop_range()
 def wait_slot_layer(self, slot_idx: int) -> None:
 """
@@ -478,7 +480,8 @@ class OffloadEngine:
 else:
 self.sparse_policy.on_decode_offload(cpu_block_id, layer_id, k_cache, valid_tokens)
-torch.cuda.nvtx.range_push(f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]")
+nvtx_label = f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]"
+nvtx.push_range(message=nvtx_label, color="green")
 with torch.cuda.stream(self.transfer_stream_main):
 # Wait for both compute_stream and default stream
 # - compute_stream: for flash attention operations
@@ -494,7 +497,7 @@ class OffloadEngine:
 self.v_cache_gpu[slot_idx], non_blocking=True
 )
 self.ring_slot_offload_done[slot_idx].record(self.transfer_stream_main)
-torch.cuda.nvtx.range_pop()
+nvtx.pop_range()
 # ----- KV access methods for ring buffer -----
@@ -792,7 +795,8 @@ class OffloadEngine:
 # Use per-layer stream for parallel offloads
 stream = self.prefill_offload_streams[layer_id]
-torch.cuda.nvtx.range_push(f"AsyncPrefillOffload: L{layer_id}->CPU[{cpu_block_id}]")
+nvtx_label = f"D2H: PrefillBuffer L{layer_id}->CPU[{cpu_block_id}]"
+nvtx.push_range(message=nvtx_label, color="orange")
 with torch.cuda.stream(stream):
 # Wait for compute to finish writing to prefill buffer
 stream.wait_stream(self.compute_stream)
@@ -807,7 +811,7 @@ class OffloadEngine:
 # Record completion event
 self.prefill_offload_events[layer_id].record(stream)
-torch.cuda.nvtx.range_pop()
+nvtx.pop_range()
 def wait_all_prefill_offloads(self) -> None:
 """Wait for all prefill buffer offloads to complete."""

View File

@@ -9,6 +9,7 @@
 # --dataset DATASET Task name (default: niah_single_1)
 # --sample INDEX Sample index (default: 0)
 # --gpu GPU_ID GPU to use (default: 0)
+# --num-gpu-blocks N Number of GPU blocks/slots (default: 4)
 # --no-offload Disable CPU offload
 #
 # Output:
@@ -18,6 +19,7 @@
 # bash scripts/profile_offload.sh
 # bash scripts/profile_offload.sh --dataset niah_single_1 --sample 5
 # bash scripts/profile_offload.sh --gpu 1 --no-offload
+# bash scripts/profile_offload.sh --num-gpu-blocks 8
 set -e
@@ -25,6 +27,7 @@ set -e
 DATASET="niah_single_1"
 SAMPLE_INDEX="0"
 GPU_ID="0"
+NUM_GPU_BLOCKS="4"
 ENABLE_OFFLOAD="--enable-offload"
 # Parse arguments
@@ -46,6 +49,10 @@ while [[ $# -gt 0 ]]; do
 ENABLE_OFFLOAD=""
 shift
 ;;
+--num-gpu-blocks)
+NUM_GPU_BLOCKS="$2"
+shift 2
+;;
 -h|--help)
 echo "Usage: $0 [options]"
 echo ""
@@ -54,6 +61,7 @@ while [[ $# -gt 0 ]]; do
 echo " --sample INDEX Sample index (default: 0)"
 echo " --gpu GPU_ID GPU to use (default: 0)"
 echo " --no-offload Disable CPU offload"
+echo " --num-gpu-blocks N Number of GPU blocks/slots (default: 4)"
 exit 0
 ;;
 *)
@@ -76,7 +84,7 @@ mkdir -p "$OUTPUT_DIR"
 TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 OFFLOAD_SUFFIX=""
 if [ -n "$ENABLE_OFFLOAD" ]; then
-OFFLOAD_SUFFIX="_offload"
+OFFLOAD_SUFFIX="_offload_${NUM_GPU_BLOCKS}slots"
 fi
 OUTPUT_FILE="$OUTPUT_DIR/ruler_${DATASET}_sample${SAMPLE_INDEX}${OFFLOAD_SUFFIX}_${TIMESTAMP}"
@@ -87,6 +95,7 @@ echo "Test script: $TEST_SCRIPT"
 echo "Dataset: $DATASET"
 echo "Sample: $SAMPLE_INDEX"
 echo "GPU: $GPU_ID"
+echo "GPU Blocks: $NUM_GPU_BLOCKS"
 echo "Offload: ${ENABLE_OFFLOAD:-disabled}"
 echo "Output file: $OUTPUT_FILE.nsys-rep"
 echo ""
@@ -109,6 +118,7 @@ nsys profile \
 python "$TEST_SCRIPT" \
 --datasets "$DATASET" \
 --sample-indices "$SAMPLE_INDEX" \
+--num-gpu-blocks "$NUM_GPU_BLOCKS" \
 $ENABLE_OFFLOAD \
 --quiet