#!/bin/bash

# Profile bench.py using NVIDIA Nsight Systems (GPU-only mode)
#
# Usage:
#   bash scripts/profile.sh [options]
#
# Options:
#   --max-len LENGTH     Max sequence length (default: 32768)
#   --policy POLICY      Sparse policy: full, xattn (default: xattn)
#   --gpu GPU_ID         GPU to use (default: 0)
#   --gpu-util UTIL      GPU memory utilization (default: 0.9)
#   --input-len LENGTH   Input length (default: max-len - 1)
#   --bench-decode       Run decode benchmark instead of prefill
#   -h, --help           Show the option summary and exit
#
# Output:
#   results/nsys/bench_<policy>_<max_len>_<mode>_<timestamp>.nsys-rep
#   (<max_len> is shortened to "<N>k" when it is at least 1024)
#
# Examples:
#   bash scripts/profile.sh
#   bash scripts/profile.sh --max-len 65536 --gpu-util 0.7
#   bash scripts/profile.sh --policy full --max-len 32768
#   bash scripts/profile.sh --bench-decode

set -euo pipefail

# Default configuration
MAX_LEN="32768"
POLICY="xattn"
GPU_ID="0"
GPU_UTIL="0.9"
INPUT_LEN=""          # empty => --input-len is not forwarded to bench.py
BENCH_MODE="prefill"

# Print an error message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the option summary to stdout.
usage() {
  echo "Usage: $0 [options]"
  echo ""
  echo "Options:"
  echo " --max-len LENGTH Max sequence length (default: 32768)"
  echo " --policy POLICY Sparse policy: full, xattn (default: xattn)"
  echo " --gpu GPU_ID GPU to use (default: 0)"
  echo " --gpu-util UTIL GPU memory utilization (default: 0.9)"
  echo " --input-len LENGTH Input length (default: max-len - 1)"
  echo " --bench-decode Run decode benchmark instead of prefill"
}

# Abort with a clear message when a value-taking option is the last argument.
# Without this check `shift 2` fails and `set -e` exits silently.
#   $1 - option name, $2 - number of arguments remaining (option included)
require_value() {
  if (( $2 < 2 )); then
    die "Option $1 requires a value"
  fi
}

# Parse arguments
while (( $# > 0 )); do
  case "$1" in
    --max-len)
      require_value "$1" "$#"
      MAX_LEN="$2"
      shift 2
      ;;
    --policy)
      require_value "$1" "$#"
      POLICY="$2"
      shift 2
      ;;
    --gpu)
      require_value "$1" "$#"
      GPU_ID="$2"
      shift 2
      ;;
    --gpu-util)
      require_value "$1" "$#"
      GPU_UTIL="$2"
      shift 2
      ;;
    --input-len)
      require_value "$1" "$#"
      INPUT_LEN="$2"
      shift 2
      ;;
    --bench-decode)
      BENCH_MODE="decode"
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      die "Unknown option: $1"
      ;;
  esac
done

# MAX_LEN is used in integer arithmetic below, so reject anything that is not
# a plain decimal number before it can produce a confusing shell error.
if [[ ! "$MAX_LEN" =~ ^[0-9]+$ ]]; then
  die "--max-len must be a non-negative integer, got: '$MAX_LEN'"
fi

# Fail early, before creating directories or printing the banner, when the
# profiler is not installed. 127 is the status the shell itself would report
# for a command that cannot be found.
if ! command -v nsys >/dev/null 2>&1; then
  printf '%s\n' "nsys not found in PATH; install NVIDIA Nsight Systems first" >&2
  exit 127
fi

# Path configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
OUTPUT_DIR="$PROJECT_ROOT/results/nsys"
BENCH_SCRIPT="$PROJECT_ROOT/bench.py"

# Create output directory if needed
mkdir -p "$OUTPUT_DIR"

# Generate timestamp for unique filename
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"

# Convert max_len to human-readable format (e.g., 32768 -> 32k).
# The 10# prefix forces base 10 so a value with leading zeros is not parsed
# as octal by shell arithmetic.
if (( 10#$MAX_LEN >= 1024 )); then
  MAX_LEN_SUFFIX="$(( 10#$MAX_LEN / 1024 ))k"
else
  MAX_LEN_SUFFIX="${MAX_LEN}"
fi

OUTPUT_FILE="$OUTPUT_DIR/bench_${POLICY}_${MAX_LEN_SUFFIX}_${BENCH_MODE}_${TIMESTAMP}"

# Build bench.py arguments as an array so every value reaches bench.py as
# exactly one argument, with no word-splitting or glob expansion.
BENCH_ARGS=(--max-len "$MAX_LEN" --gpu-util "$GPU_UTIL")

if [[ -n "$POLICY" ]]; then
  BENCH_ARGS+=(--policy "$POLICY")
fi

if [[ -n "$INPUT_LEN" ]]; then
  BENCH_ARGS+=(--input-len "$INPUT_LEN")
fi

if [[ "$BENCH_MODE" == "decode" ]]; then
  BENCH_ARGS+=(--bench-decode)
fi

echo "============================================================"
echo "NVIDIA Nsight Systems Profiling (GPU-only)"
echo "============================================================"
echo "Bench script: $BENCH_SCRIPT"
echo "Policy: $POLICY"
echo "Max length: $MAX_LEN"
echo "GPU: $GPU_ID"
echo "GPU util: $GPU_UTIL"
echo "Bench mode: $BENCH_MODE"
echo "Output file: $OUTPUT_FILE.nsys-rep"
echo ""

# nsys profile options:
#   --trace=cuda,nvtx      : Trace CUDA API and NVTX markers
#   --force-overwrite=true : Overwrite existing output file
#   --output=<path>        : Output file path (without .nsys-rep extension)

echo "Running nsys profile..."
echo "Command: python bench.py ${BENCH_ARGS[*]}"
echo ""

# ${PYTHONPATH:+:$PYTHONPATH} appends the existing value only when it is set
# and non-empty, which avoids a trailing ':' (an empty path entry) and keeps
# the expansion safe under `set -u`.
CUDA_VISIBLE_DEVICES="$GPU_ID" \
PYTHONPATH="$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
  nsys profile \
    --trace=cuda,nvtx \
    --force-overwrite=true \
    --output="$OUTPUT_FILE" \
    python "$BENCH_SCRIPT" "${BENCH_ARGS[@]}"

echo ""
echo "============================================================"
echo "Profiling completed successfully!"
echo "============================================================"
echo "Output file: $OUTPUT_FILE.nsys-rep"
echo ""
echo "To view results in GUI:"
echo " nsight-sys $OUTPUT_FILE.nsys-rep"
echo ""
echo "To export statistics:"
echo " nsys stats --report cuda_api_sum $OUTPUT_FILE.nsys-rep"
echo " nsys stats --report cuda_gpu_kern_sum $OUTPUT_FILE.nsys-rep"
echo " nsys stats --report cuda_gpu_mem_size_sum $OUTPUT_FILE.nsys-rep"
echo "============================================================"