Add test_ruler_niah.sh for independent sample testing across multiple GPUs. Each sample runs in a separate Python process to avoid state accumulation issues. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
243 lines
6.5 KiB
Bash
Executable File
243 lines
6.5 KiB
Bash
Executable File
#!/bin/bash
#
# RULER NIAH Parallel Test Script
#
# Runs the RULER NIAH benchmark across multiple GPUs in parallel.
# Each sample is tested independently (separate Python process per sample).
#
# Usage:
#   ./tests/test_ruler_niah.sh [OPTIONS]
#
# Options:
#   --gpus "0,1,2,3"   GPUs to use (default: "0,1,2,3")
#   --total N          Total samples to test (default: 100)
#   --model PATH       Model path (default: ~/models/Llama-3.1-8B-Instruct)
#   --output FILE      Output log file (default: /tmp/ruler_niah_results.log)
#

# Note: 'set -e' was removed because ((var++)) returns status 1 when var is 0,
# which would make the script exit.

# Default configuration. GPUS, TOTAL_SAMPLES, MODEL_PATH and OUTPUT_LOG are the
# values replaced by --gpus, --total, --model and --output respectively.
GPUS="0,1,2,3"
TOTAL_SAMPLES=100
MODEL_PATH="${HOME}/models/Llama-3.1-8B-Instruct"
OUTPUT_LOG="/tmp/ruler_niah_results.log"

# Absolute directory that contains this script.
# CDPATH is cleared for the 'cd' so that a relative script path is resolved
# against the current directory only, and so that 'cd' prints nothing extra
# into the command substitution. '--' protects paths that start with '-'.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" || {
  echo "Cannot determine the directory of ${BASH_SOURCE[0]}" >&2
  exit 1
}

# Parent of the script directory; prepended to PYTHONPATH when running samples.
PROJECT_ROOT="$(dirname -- "$SCRIPT_DIR")"

# Parse arguments

#######################################
# Parse command-line options into the configuration globals.
# Globals:   GPUS, TOTAL_SAMPLES, MODEL_PATH, OUTPUT_LOG (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   an error message on stderr when the input is rejected
# Returns:   0 on success; exits the shell with status 1 on an unknown
#            option, a missing value, or a malformed --gpus/--total value
#######################################
parse_args() {
  local opt
  local -r gpu_list_re='^[0-9]+(,[0-9]+)*$'
  local -r count_re='^[0-9]+$'

  while [[ $# -gt 0 ]]; do
    opt=$1

    case "$opt" in
      --gpus | --total | --model | --output)
        # Every option takes a value. Without this check a trailing option
        # makes 'shift 2' fail without shifting and the loop never ends.
        if [[ $# -lt 2 ]]; then
          echo "Option $opt requires a value" >&2
          exit 1
        fi
        ;;
      *)
        echo "Unknown option: $opt" >&2
        exit 1
        ;;
    esac

    case "$opt" in
      --gpus)
        # GPU ids are used in integer arithmetic (port = 2333 + gpu), so only
        # a comma-separated list of non-negative integers is accepted.
        if [[ ! "$2" =~ $gpu_list_re ]]; then
          echo "Invalid --gpus value: '$2' (expected e.g. \"0,1,2,3\")" >&2
          exit 1
        fi
        GPUS="$2"
        ;;
      --total)
        if [[ ! "$2" =~ $count_re ]]; then
          echo "Invalid --total value: '$2' (expected a non-negative integer)" >&2
          exit 1
        fi
        # Force base 10 so that values such as "08" are not read as octal
        # by later arithmetic.
        TOTAL_SAMPLES=$((10#$2))
        ;;
      --model)
        MODEL_PATH="$2"
        ;;
      --output)
        OUTPUT_LOG="$2"
        ;;
    esac

    shift 2
  done
}

parse_args "$@"

# Split the comma-separated GPU list into an array and count its entries
IFS=',' read -r -a GPU_ARRAY <<< "$GPUS"
NUM_GPUS="${#GPU_ARRAY[@]}"

# Print the run configuration, followed by one blank line
cat <<EOF
============================================================
RULER NIAH Parallel Test
============================================================
GPUs: ${GPUS} (${NUM_GPUS} GPUs)
Total samples: ${TOTAL_SAMPLES}
Model: ${MODEL_PATH}
Output log: ${OUTPUT_LOG}
Project root: ${PROJECT_ROOT}
============================================================

EOF

# Create output directory. Fail now rather than after the whole run, when the
# final report could no longer be written.
if ! mkdir -p -- "$(dirname -- "$OUTPUT_LOG")"; then
  echo "Cannot create directory for output log: $OUTPUT_LOG" >&2
  exit 1
fi

# Initialize result tracking. The directory holds the per-sample .result
# files and the per-GPU .log/.summary files. mktemp gives an unpredictable,
# freshly created directory instead of a PID-derived name under /tmp.
RESULT_DIR="$(mktemp -d "${TMPDIR:-/tmp}/ruler_niah_results_XXXXXX")" || {
  echo "Cannot create temporary result directory" >&2
  exit 1
}

#######################################
# Run a single sample on a specific GPU in its own Python process.
# Globals:   RESULT_DIR, SCRIPT_DIR, PROJECT_ROOT, MODEL_PATH, PYTHONPATH (read)
# Arguments: $1 - GPU id (integer; also used as the port offset)
#            $2 - sample index passed to --sample-indices
# Outputs:   stdout and stderr of the Python process, merged on stdout;
#            writes PASS or FAIL to $RESULT_DIR/sample_<index>.result
# Returns:   the exit status of the Python process
#######################################
run_sample() {
  local gpu=$1
  local sample_idx=$2
  local result_file="$RESULT_DIR/sample_${sample_idx}.result"

  # Run test with unique port based on GPU
  local port=$((2333 + gpu))

  # Append the existing PYTHONPATH only when it is non-empty. A bare
  # "$PROJECT_ROOT:" ends in an empty path entry, which Python may resolve to
  # the current working directory.
  local exit_code=0
  NANOVLLM_DIST_PORT="$port" \
  CUDA_VISIBLE_DEVICES="$gpu" \
  PYTHONPATH="${PROJECT_ROOT}${PYTHONPATH:+:${PYTHONPATH}}" \
    python "$SCRIPT_DIR/test_ruler_niah.py" \
      --model "$MODEL_PATH" \
      --enable-offload \
      --sample-indices "$sample_idx" \
      --quiet \
      2>&1 || exit_code=$?

  if ((exit_code == 0)); then
    echo "PASS" > "$result_file"
  else
    echo "FAIL" > "$result_file"
  fi

  return "$exit_code"
}

#######################################
# Run every sample assigned to one GPU, one after another.
# The worker starts at sample index $2 and advances by NUM_GPUS each time.
# Globals:   RESULT_DIR, TOTAL_SAMPLES, NUM_GPUS (read)
# Arguments: $1 - GPU id
#            $2 - position of this GPU in the GPU list (first sample index)
# Outputs:   progress lines on stdout, also appended to
#            $RESULT_DIR/gpu_<id>.log together with each sample's output;
#            writes "<pass> <fail>" to $RESULT_DIR/gpu_<id>.summary
# Returns:   1 without running anything when NUM_GPUS is not >= 1,
#            otherwise the status of writing the summary file
#######################################
run_gpu_worker() {
  local gpu=$1
  local gpu_idx=$2
  local log_file="$RESULT_DIR/gpu_${gpu}.log"
  local stride=$((${NUM_GPUS:-0}))

  # A stride of 0 would never advance sample_idx and loop forever.
  if ((stride < 1)); then
    echo "[GPU $gpu] Invalid NUM_GPUS '${NUM_GPUS}': must be >= 1" >&2
    return 1
  fi

  echo "[GPU $gpu] Starting worker (gpu_idx=$gpu_idx)" | tee -a "$log_file"

  # Calculate which samples this GPU handles
  local sample_idx=$gpu_idx
  local pass_count=0
  local fail_count=0
  local start_time end_time duration

  while [ "$sample_idx" -lt "$TOTAL_SAMPLES" ]; do
    echo "[GPU $gpu] Testing sample $sample_idx..." | tee -a "$log_file"

    start_time=$(date +%s)

    if run_sample "$gpu" "$sample_idx" >> "$log_file" 2>&1; then
      echo "[GPU $gpu] Sample $sample_idx: PASS" | tee -a "$log_file"
      pass_count=$((pass_count + 1))
    else
      echo "[GPU $gpu] Sample $sample_idx: FAIL" | tee -a "$log_file"
      fail_count=$((fail_count + 1))
    fi

    end_time=$(date +%s)
    duration=$((end_time - start_time))
    echo "[GPU $gpu] Sample $sample_idx completed in ${duration}s" | tee -a "$log_file"

    # Move to next sample for this GPU (stride by number of GPUs)
    sample_idx=$((sample_idx + stride))

    # Small delay to avoid port conflicts; only needed when this worker is
    # about to start another sample.
    if [ "$sample_idx" -lt "$TOTAL_SAMPLES" ]; then
      sleep 2
    fi
  done

  echo "[GPU $gpu] Worker finished: $pass_count passed, $fail_count failed" | tee -a "$log_file"
  echo "$pass_count $fail_count" > "$RESULT_DIR/gpu_${gpu}.summary"
}

#######################################
# Start one background worker per entry of GPU_ARRAY.
# Globals:   GPU_ARRAY (read), PIDS (reset, then filled with worker PIDs)
# Arguments: none
# Outputs:   one "Launching worker" line per GPU on stdout
#######################################
launch_workers() {
  local i gpu
  PIDS=()
  for i in "${!GPU_ARRAY[@]}"; do
    gpu=${GPU_ARRAY[$i]}
    echo "Launching worker on GPU $gpu..."
    # The array index is the worker's first sample index.
    run_gpu_worker "$gpu" "$i" &
    PIDS+=("$!")
  done
}

#######################################
# Wait for every worker recorded in PIDS.
# Globals:   PIDS (read)
# Arguments: none
# Outputs:   a warning on stderr for each worker with a non-zero exit status
# Returns:   0 when all workers exited with status 0, 1 otherwise
#######################################
wait_for_workers() {
  local pid status failed=0
  for pid in "${PIDS[@]}"; do
    status=0
    wait "$pid" || status=$?
    if ((status != 0)); then
      echo "Warning: worker with PID $pid exited with status $status" >&2
      failed=1
    fi
  done
  return "$failed"
}

# Start time
START_TIME=$(date +%s)
echo "Starting parallel test at $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# Launch workers for each GPU in background
launch_workers

echo ""
echo "All workers launched. Waiting for completion..."
echo "Monitor progress with: tail -f $RESULT_DIR/gpu_*.log"
echo ""

# Wait for all workers to complete. Per-sample pass/fail is collected from the
# summary files afterwards, so an abnormal worker is only reported here.
wait_for_workers

#######################################
# Print num / den truncated to a fixed number of decimal places, using only
# integer arithmetic (no external calculator needed).
# Arguments: $1 - numerator (non-negative integer)
#            $2 - denominator (non-negative integer)
#            $3 - number of decimal places
# Outputs:   the quotient on stdout, e.g. "66.6"; a zero denominator prints
#            zero (e.g. "0.00") instead of failing
#######################################
fixed_point_div() {
  local num=$1 den=$2 places=$3
  local factor=$((10 ** places))
  local scaled=0

  if ((den != 0)); then
    scaled=$((num * factor / den))
  fi

  if ((places == 0)); then
    printf '%d\n' "$scaled"
  else
    printf '%d.%0*d\n' "$((scaled / factor))" "$places" "$((scaled % factor))"
  fi
}

# End time
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))

echo ""
echo "============================================================"
echo "FINAL RESULTS"
echo "============================================================"

# Aggregate results from the "<pass> <fail>" summary each worker wrote
TOTAL_PASS=0
TOTAL_FAIL=0

for gpu in "${GPU_ARRAY[@]}"; do
  summary_file="$RESULT_DIR/gpu_${gpu}.summary"
  if [[ -f "$summary_file" ]] && read -r pass fail < "$summary_file"; then
    TOTAL_PASS=$((TOTAL_PASS + pass))
    TOTAL_FAIL=$((TOTAL_FAIL + fail))
    echo "GPU $gpu: $pass passed, $fail failed"
  else
    # Its samples are missing from the totals below; say so explicitly.
    echo "GPU $gpu: no summary file found, its samples are not counted" >&2
  fi
done

TOTAL_TESTED=$((TOTAL_PASS + TOTAL_FAIL))
# Yields "0.0" when nothing was tested.
ACCURACY=$(fixed_point_div "$((TOTAL_PASS * 100))" "$TOTAL_TESTED" 1)

echo ""
echo "------------------------------------------------------------"
echo "Total: $TOTAL_PASS/$TOTAL_TESTED passed ($ACCURACY%)"
echo "Duration: ${DURATION}s ($(fixed_point_div "$DURATION" 60 1) minutes)"
# DURATION is 0 for a run that finishes within one second; fixed_point_div
# reports 0.00 in that case rather than dividing by zero.
echo "Throughput: $(fixed_point_div "$((TOTAL_TESTED * 60))" "$DURATION" 2) samples/min"
echo "------------------------------------------------------------"

#######################################
# Write the run summary and the per-sample results to a file.
# Globals:   GPUS, TOTAL_SAMPLES, TOTAL_TESTED, TOTAL_PASS, TOTAL_FAIL,
#            ACCURACY, DURATION, RESULT_DIR (read)
# Arguments: $1 - path of the report file (overwritten)
# Returns:   0 on success, non-zero when the file cannot be written
#######################################
write_detailed_report() {
  local report_file=$1
  local i result_file
  local total=0

  # Only loop when TOTAL_SAMPLES is a plain number; base 10 is forced so a
  # value such as "08" is not read as octal.
  if [[ "${TOTAL_SAMPLES:-}" =~ ^[0-9]+$ ]]; then
    total=$((10#$TOTAL_SAMPLES))
  fi

  {
    echo "RULER NIAH Parallel Test Results"
    echo "================================"
    echo "Date: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "GPUs: $GPUS"
    echo "Total samples: $TOTAL_TESTED"
    echo "Passed: $TOTAL_PASS"
    echo "Failed: $TOTAL_FAIL"
    echo "Accuracy: $ACCURACY%"
    echo "Duration: ${DURATION}s"
    echo ""
    echo "Per-sample results:"
    # An arithmetic loop prints nothing for zero samples, whereas
    # 'seq 0 -1' counts downwards on some seq implementations.
    for ((i = 0; i < total; i++)); do
      result_file="$RESULT_DIR/sample_${i}.result"
      # Samples that never ran have no result file and are left out.
      if [[ -f "$result_file" ]]; then
        echo "Sample $i: $(< "$result_file")"
      fi
    done
  } > "$report_file"
}

# Save detailed results; only claim success when the file was really written
if write_detailed_report "$OUTPUT_LOG"; then
  echo ""
  echo "Detailed results saved to: $OUTPUT_LOG"
else
  echo "" >&2
  echo "Failed to write detailed results to: $OUTPUT_LOG" >&2
fi

# Cleanup
# NOTE(review): cleanup is disabled, so $RESULT_DIR (per-GPU logs, summaries
# and per-sample results) is left in place after the run.
# rm -rf "$RESULT_DIR"

# Exit with appropriate code.
# Success requires zero failures AND that every requested sample was actually
# tested. Checking failures alone would report "ALL PASSED" when no worker
# produced a summary or when no GPU was given.
if [ "${TOTAL_FAIL:-0}" -ne 0 ]; then
  echo ""
  echo "test_ruler_niah.sh: $TOTAL_FAIL FAILED"
  exit 1
elif [ "${TOTAL_TESTED:-0}" -ne "${TOTAL_SAMPLES:-0}" ]; then
  echo ""
  echo "test_ruler_niah.sh: INCOMPLETE (${TOTAL_TESTED:-0} of ${TOTAL_SAMPLES:-0} samples tested)"
  exit 1
else
  echo ""
  echo "test_ruler_niah.sh: ALL PASSED"
  exit 0
fi