🔧 chore: add GPU-only profiling script
Add scripts/profile.sh for nsys profiling of GPU-only mode benchmarks.

Usage:
  bash scripts/profile.sh                                   # Default: 32K xattn prefill
  bash scripts/profile.sh --max-len 65536 --gpu-util 0.7
  bash scripts/profile.sh --policy full
  bash scripts/profile.sh --bench-decode

Output: results/nsys/bench_<policy>_<len>_<mode>_<timestamp>.nsys-rep

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
158
scripts/profile.sh
Executable file
158
scripts/profile.sh
Executable file
@@ -0,0 +1,158 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Profile bench.py using NVIDIA Nsight Systems (GPU-only mode)
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/profile.sh [options]
|
||||
#
|
||||
# Options:
|
||||
# --max-len LENGTH Max sequence length (default: 32768)
|
||||
# --policy POLICY Sparse policy: full, xattn (default: xattn)
|
||||
# --gpu GPU_ID GPU to use (default: 0)
|
||||
# --gpu-util UTIL GPU memory utilization (default: 0.9)
|
||||
# --input-len LENGTH Input length (default: max-len - 1)
|
||||
# --bench-decode Run decode benchmark instead of prefill
|
||||
#
|
||||
# Output:
|
||||
# results/nsys/bench_<policy>_<max_len>_<mode>_<timestamp>.nsys-rep
|
||||
#
|
||||
# Examples:
|
||||
# bash scripts/profile.sh
|
||||
# bash scripts/profile.sh --max-len 65536 --gpu-util 0.7
|
||||
# bash scripts/profile.sh --policy full --max-len 32768
|
||||
# bash scripts/profile.sh --bench-decode
|
||||
|
||||
set -e

# Default configuration; each value can be overridden by a command-line flag.
MAX_LEN="32768"
POLICY="xattn"
GPU_ID="0"
GPU_UTIL="0.9"
INPUT_LEN=""
BENCH_MODE="prefill"

# Print the option summary to stdout.
usage() {
  echo "Usage: $0 [options]"
  echo ""
  echo "Options:"
  echo " --max-len LENGTH Max sequence length (default: 32768)"
  echo " --policy POLICY Sparse policy: full, xattn (default: xattn)"
  echo " --gpu GPU_ID GPU to use (default: 0)"
  echo " --gpu-util UTIL GPU memory utilization (default: 0.9)"
  echo " --input-len LENGTH Input length (default: max-len - 1)"
  echo " --bench-decode Run decode benchmark instead of prefill"
}

# Abort with a diagnostic unless a value follows the option.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
# Without this check `shift 2` fails on a trailing option and `set -e`
# terminates the script without printing anything.
require_value() {
  if (( $2 < 2 )); then
    echo "Error: option $1 requires a value" >&2
    exit 1
  fi
}

# Parse command-line options into the configuration globals above.
# Globals:   MAX_LEN, POLICY, GPU_ID, GPU_UTIL, INPUT_LEN, BENCH_MODE (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help; 1 on an unknown option, a missing option
#            value, or a --max-len that is not a positive integer
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --max-len)
        require_value "$1" "$#"
        MAX_LEN="$2"
        shift 2
        ;;
      --policy)
        require_value "$1" "$#"
        POLICY="$2"
        shift 2
        ;;
      --gpu)
        require_value "$1" "$#"
        GPU_ID="$2"
        shift 2
        ;;
      --gpu-util)
        require_value "$1" "$#"
        GPU_UTIL="$2"
        shift 2
        ;;
      --input-len)
        require_value "$1" "$#"
        INPUT_LEN="$2"
        shift 2
        ;;
      --bench-decode)
        BENCH_MODE="decode"
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        # Diagnostics go to stderr so they are not mixed into captured output.
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # The script later does integer arithmetic on MAX_LEN to build the output
  # file name, so reject anything that is not a plain positive integer here.
  if [[ ! "$MAX_LEN" =~ ^[1-9][0-9]*$ ]]; then
    echo "Error: --max-len must be a positive integer, got '$MAX_LEN'" >&2
    exit 1
  fi
}

parse_args "$@"
|
||||
|
||||
# Path configuration: everything is resolved relative to this script's own
# location, so the script can be launched from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
OUTPUT_DIR="$PROJECT_ROOT/results/nsys"
BENCH_SCRIPT="$PROJECT_ROOT/bench.py"

# Create output directory if needed
mkdir -p "$OUTPUT_DIR"

# Generate timestamp for unique filename
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# Convert a sequence length to the short label used in the output file name
# (e.g., 32768 -> 32k). Lengths below 1024 are printed unchanged.
# Arguments: $1 - sequence length
# Outputs:   the label on stdout
# A value that is not a plain run of digits is passed through as-is instead of
# tripping an "integer expression expected" error, and 10# forces base 10 so a
# zero-padded value is not read as (possibly invalid) octal.
format_len_label() {
  local len="$1"
  if [[ "$len" =~ ^[0-9]+$ ]] && (( 10#$len >= 1024 )); then
    printf '%s\n' "$(( 10#$len / 1024 ))k"
  else
    printf '%s\n' "$len"
  fi
}

MAX_LEN_SUFFIX="$(format_len_label "$MAX_LEN")"

# Report path without extension; the echo/nsys lines append ".nsys-rep".
OUTPUT_FILE="$OUTPUT_DIR/bench_${POLICY}_${MAX_LEN_SUFFIX}_${BENCH_MODE}_${TIMESTAMP}"
|
||||
|
||||
# Build the bench.py argument list as an array so every value stays a single
# word when the command is executed (no reliance on unquoted word splitting).
# Globals: MAX_LEN, GPU_UTIL, POLICY, INPUT_LEN, BENCH_MODE (read)
#          BENCH_ARGS (written, indexed array)
build_bench_args() {
  BENCH_ARGS=(--max-len "$MAX_LEN" --gpu-util "$GPU_UTIL")

  if [[ -n "$POLICY" ]]; then
    BENCH_ARGS+=(--policy "$POLICY")
  fi

  if [[ -n "$INPUT_LEN" ]]; then
    BENCH_ARGS+=(--input-len "$INPUT_LEN")
  fi

  if [[ "$BENCH_MODE" == "decode" ]]; then
    BENCH_ARGS+=(--bench-decode)
  fi
}

build_bench_args

echo "============================================================"
echo "NVIDIA Nsight Systems Profiling (GPU-only)"
echo "============================================================"
echo "Bench script: $BENCH_SCRIPT"
echo "Policy: $POLICY"
echo "Max length: $MAX_LEN"
echo "GPU: $GPU_ID"
echo "GPU util: $GPU_UTIL"
echo "Bench mode: $BENCH_MODE"
echo "Output file: $OUTPUT_FILE.nsys-rep"
echo ""

# nsys profile options:
# --trace=cuda,nvtx : Trace CUDA API and NVTX markers
# --force-overwrite=true : Overwrite existing output file
# --output=<path> : Output file path (without .nsys-rep extension)

echo "Running nsys profile..."
echo "Command: python bench.py ${BENCH_ARGS[*]}"
echo ""

# ":$PYTHONPATH" is appended only when PYTHONPATH is non-empty, so an unset
# variable does not leave a trailing ':' (an empty search-path entry).
# NOTE(review): Python appears to resolve an empty entry to the current
# working directory - confirm.
CUDA_VISIBLE_DEVICES="$GPU_ID" PYTHONPATH="$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
  nsys profile \
  --trace=cuda,nvtx \
  --force-overwrite=true \
  --output="$OUTPUT_FILE" \
  python "$BENCH_SCRIPT" "${BENCH_ARGS[@]}"
|
||||
|
||||
# Closing summary: report where the profile was written and list follow-up
# commands for opening it in the GUI or exporting statistics from it.
rule="============================================================"
report="$OUTPUT_FILE.nsys-rep"

printf '%s\n' \
  "" \
  "$rule" \
  "Profiling completed successfully!" \
  "$rule" \
  "Output file: $report" \
  "" \
  "To view results in GUI:" \
  " nsight-sys $report" \
  "" \
  "To export statistics:" \
  " nsys stats --report cuda_api_sum $report" \
  " nsys stats --report cuda_gpu_kern_sum $report" \
  " nsys stats --report cuda_gpu_mem_size_sum $report" \
  "$rule"
|
||||
Reference in New Issue
Block a user