⚡ perf: improve NVTX profiling with colored ranges and configurable slots
- Switch from torch.cuda.nvtx to nvtx package for colored range support
- Add color coding: blue for H2D, green for D2H decode, orange for D2H prefill
- Add --num-gpu-blocks parameter to profile_offload.sh
- Include slot count in output filename for easier comparison

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
@@ -9,6 +9,7 @@ Key design principles for CUDA Graph compatibility:
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.cuda.nvtx
|
import torch.cuda.nvtx
|
||||||
|
import nvtx
|
||||||
from torch import Tensor
|
from torch import Tensor
|
||||||
from typing import Dict, List, Tuple, Optional
|
from typing import Dict, List, Tuple, Optional
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@@ -403,7 +404,8 @@ class OffloadEngine:
|
|||||||
nvtx_label = f"H2D: L{layer_id} Chunk{chunk_idx} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
|
nvtx_label = f"H2D: L{layer_id} Chunk{chunk_idx} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
|
||||||
else:
|
else:
|
||||||
nvtx_label = f"H2D: L{layer_id} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
|
nvtx_label = f"H2D: L{layer_id} CPU[{cpu_block_id}]->Slot[{slot_idx}]"
|
||||||
torch.cuda.nvtx.range_push(nvtx_label)
|
|
||||||
|
nvtx.push_range(message=nvtx_label, color="blue")
|
||||||
with torch.cuda.stream(stream):
|
with torch.cuda.stream(stream):
|
||||||
# Wait for previous compute on this slot to complete before overwriting
|
# Wait for previous compute on this slot to complete before overwriting
|
||||||
# This prevents data race: transfer must not start until attention finishes reading
|
# This prevents data race: transfer must not start until attention finishes reading
|
||||||
@@ -421,7 +423,7 @@ class OffloadEngine:
|
|||||||
self.v_cache_cpu[layer_id, cpu_block_id], non_blocking=True
|
self.v_cache_cpu[layer_id, cpu_block_id], non_blocking=True
|
||||||
)
|
)
|
||||||
self.ring_slot_ready[slot_idx].record(stream)
|
self.ring_slot_ready[slot_idx].record(stream)
|
||||||
torch.cuda.nvtx.range_pop()
|
nvtx.pop_range()
|
||||||
|
|
||||||
def wait_slot_layer(self, slot_idx: int) -> None:
|
def wait_slot_layer(self, slot_idx: int) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -478,7 +480,8 @@ class OffloadEngine:
|
|||||||
else:
|
else:
|
||||||
self.sparse_policy.on_decode_offload(cpu_block_id, layer_id, k_cache, valid_tokens)
|
self.sparse_policy.on_decode_offload(cpu_block_id, layer_id, k_cache, valid_tokens)
|
||||||
|
|
||||||
torch.cuda.nvtx.range_push(f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]")
|
nvtx_label = f"D2H: Slot[{slot_idx}]->CPU[L{layer_id},B{cpu_block_id}]"
|
||||||
|
nvtx.push_range(message=nvtx_label, color="green")
|
||||||
with torch.cuda.stream(self.transfer_stream_main):
|
with torch.cuda.stream(self.transfer_stream_main):
|
||||||
# Wait for both compute_stream and default stream
|
# Wait for both compute_stream and default stream
|
||||||
# - compute_stream: for flash attention operations
|
# - compute_stream: for flash attention operations
|
||||||
@@ -494,7 +497,7 @@ class OffloadEngine:
|
|||||||
self.v_cache_gpu[slot_idx], non_blocking=True
|
self.v_cache_gpu[slot_idx], non_blocking=True
|
||||||
)
|
)
|
||||||
self.ring_slot_offload_done[slot_idx].record(self.transfer_stream_main)
|
self.ring_slot_offload_done[slot_idx].record(self.transfer_stream_main)
|
||||||
torch.cuda.nvtx.range_pop()
|
nvtx.pop_range()
|
||||||
|
|
||||||
# ----- KV access methods for ring buffer -----
|
# ----- KV access methods for ring buffer -----
|
||||||
|
|
||||||
@@ -792,7 +795,8 @@ class OffloadEngine:
|
|||||||
# Use per-layer stream for parallel offloads
|
# Use per-layer stream for parallel offloads
|
||||||
stream = self.prefill_offload_streams[layer_id]
|
stream = self.prefill_offload_streams[layer_id]
|
||||||
|
|
||||||
torch.cuda.nvtx.range_push(f"AsyncPrefillOffload: L{layer_id}->CPU[{cpu_block_id}]")
|
nvtx_label = f"D2H: PrefillBuffer L{layer_id}->CPU[{cpu_block_id}]"
|
||||||
|
nvtx.push_range(message=nvtx_label, color="orange")
|
||||||
with torch.cuda.stream(stream):
|
with torch.cuda.stream(stream):
|
||||||
# Wait for compute to finish writing to prefill buffer
|
# Wait for compute to finish writing to prefill buffer
|
||||||
stream.wait_stream(self.compute_stream)
|
stream.wait_stream(self.compute_stream)
|
||||||
@@ -807,7 +811,7 @@ class OffloadEngine:
|
|||||||
|
|
||||||
# Record completion event
|
# Record completion event
|
||||||
self.prefill_offload_events[layer_id].record(stream)
|
self.prefill_offload_events[layer_id].record(stream)
|
||||||
torch.cuda.nvtx.range_pop()
|
nvtx.pop_range()
|
||||||
|
|
||||||
def wait_all_prefill_offloads(self) -> None:
|
def wait_all_prefill_offloads(self) -> None:
|
||||||
"""Wait for all prefill buffer offloads to complete."""
|
"""Wait for all prefill buffer offloads to complete."""
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
# --dataset DATASET Task name (default: niah_single_1)
|
# --dataset DATASET Task name (default: niah_single_1)
|
||||||
# --sample INDEX Sample index (default: 0)
|
# --sample INDEX Sample index (default: 0)
|
||||||
# --gpu GPU_ID GPU to use (default: 0)
|
# --gpu GPU_ID GPU to use (default: 0)
|
||||||
|
# --num-gpu-blocks N Number of GPU blocks/slots (default: 4)
|
||||||
# --no-offload Disable CPU offload
|
# --no-offload Disable CPU offload
|
||||||
#
|
#
|
||||||
# Output:
|
# Output:
|
||||||
@@ -18,6 +19,7 @@
|
|||||||
# bash scripts/profile_offload.sh
|
# bash scripts/profile_offload.sh
|
||||||
# bash scripts/profile_offload.sh --dataset niah_single_1 --sample 5
|
# bash scripts/profile_offload.sh --dataset niah_single_1 --sample 5
|
||||||
# bash scripts/profile_offload.sh --gpu 1 --no-offload
|
# bash scripts/profile_offload.sh --gpu 1 --no-offload
|
||||||
|
# bash scripts/profile_offload.sh --num-gpu-blocks 8
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
@@ -25,6 +27,7 @@ set -e
|
|||||||
DATASET="niah_single_1"
|
DATASET="niah_single_1"
|
||||||
SAMPLE_INDEX="0"
|
SAMPLE_INDEX="0"
|
||||||
GPU_ID="0"
|
GPU_ID="0"
|
||||||
|
NUM_GPU_BLOCKS="4"
|
||||||
ENABLE_OFFLOAD="--enable-offload"
|
ENABLE_OFFLOAD="--enable-offload"
|
||||||
|
|
||||||
# Parse arguments
|
# Parse arguments
|
||||||
@@ -46,6 +49,10 @@ while [[ $# -gt 0 ]]; do
|
|||||||
ENABLE_OFFLOAD=""
|
ENABLE_OFFLOAD=""
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--num-gpu-blocks)
|
||||||
|
NUM_GPU_BLOCKS="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
-h|--help)
|
-h|--help)
|
||||||
echo "Usage: $0 [options]"
|
echo "Usage: $0 [options]"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -54,6 +61,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --sample INDEX Sample index (default: 0)"
|
echo " --sample INDEX Sample index (default: 0)"
|
||||||
echo " --gpu GPU_ID GPU to use (default: 0)"
|
echo " --gpu GPU_ID GPU to use (default: 0)"
|
||||||
echo " --no-offload Disable CPU offload"
|
echo " --no-offload Disable CPU offload"
|
||||||
|
echo " --num-gpu-blocks N Number of GPU blocks/slots (default: 4)"
|
||||||
exit 0
|
exit 0
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -76,7 +84,7 @@ mkdir -p "$OUTPUT_DIR"
|
|||||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||||
OFFLOAD_SUFFIX=""
|
OFFLOAD_SUFFIX=""
|
||||||
if [ -n "$ENABLE_OFFLOAD" ]; then
|
if [ -n "$ENABLE_OFFLOAD" ]; then
|
||||||
OFFLOAD_SUFFIX="_offload"
|
OFFLOAD_SUFFIX="_offload_${NUM_GPU_BLOCKS}slots"
|
||||||
fi
|
fi
|
||||||
OUTPUT_FILE="$OUTPUT_DIR/ruler_${DATASET}_sample${SAMPLE_INDEX}${OFFLOAD_SUFFIX}_${TIMESTAMP}"
|
OUTPUT_FILE="$OUTPUT_DIR/ruler_${DATASET}_sample${SAMPLE_INDEX}${OFFLOAD_SUFFIX}_${TIMESTAMP}"
|
||||||
|
|
||||||
@@ -87,6 +95,7 @@ echo "Test script: $TEST_SCRIPT"
|
|||||||
echo "Dataset: $DATASET"
|
echo "Dataset: $DATASET"
|
||||||
echo "Sample: $SAMPLE_INDEX"
|
echo "Sample: $SAMPLE_INDEX"
|
||||||
echo "GPU: $GPU_ID"
|
echo "GPU: $GPU_ID"
|
||||||
|
echo "GPU Blocks: $NUM_GPU_BLOCKS"
|
||||||
echo "Offload: ${ENABLE_OFFLOAD:-disabled}"
|
echo "Offload: ${ENABLE_OFFLOAD:-disabled}"
|
||||||
echo "Output file: $OUTPUT_FILE.nsys-rep"
|
echo "Output file: $OUTPUT_FILE.nsys-rep"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -109,6 +118,7 @@ nsys profile \
|
|||||||
python "$TEST_SCRIPT" \
|
python "$TEST_SCRIPT" \
|
||||||
--datasets "$DATASET" \
|
--datasets "$DATASET" \
|
||||||
--sample-indices "$SAMPLE_INDEX" \
|
--sample-indices "$SAMPLE_INDEX" \
|
||||||
|
--num-gpu-blocks "$NUM_GPU_BLOCKS" \
|
||||||
$ENABLE_OFFLOAD \
|
$ENABLE_OFFLOAD \
|
||||||
--quiet
|
--quiet
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user