✅ test: add parallel multi-GPU RULER NIAH test script
Add test_ruler_niah.sh for independent sample testing across multiple GPUs. Each sample runs in a separate Python process to avoid state accumulation issues. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
242
tests/test_ruler_niah.sh
Executable file
242
tests/test_ruler_niah.sh
Executable file
@@ -0,0 +1,242 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# RULER NIAH Parallel Test Script
|
||||
#
|
||||
# Runs RULER NIAH benchmark across multiple GPUs in parallel.
|
||||
# Each sample is tested independently (separate Python process per sample).
|
||||
#
|
||||
# Usage:
|
||||
# ./tests/test_ruler_niah.sh [OPTIONS]
|
||||
#
|
||||
# Options:
|
||||
# --gpus "0,1,2,3" GPUs to use (default: "0,1,2,3")
|
||||
# --total N Total samples to test (default: 100)
|
||||
# --model PATH Model path (default: ~/models/Llama-3.1-8B-Instruct)
|
||||
# --output FILE Output log file (default: /tmp/ruler_niah_results.log)
|
||||
#
|
||||
|
||||
# Note: Removed 'set -e' because ((var++)) returns 1 when var=0, which triggers exit
|
||||
|
||||
# Default configuration (each value can be overridden by a command-line option)
GPUS="0,1,2,3"
TOTAL_SAMPLES=100
MODEL_PATH="$HOME/models/Llama-3.1-8B-Instruct"
OUTPUT_LOG="/tmp/ruler_niah_results.log"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"

# Print an error message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Parse command-line options into the configuration globals.
# Globals:   GPUS, TOTAL_SAMPLES, MODEL_PATH, OUTPUT_LOG (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   usage text on stdout for -h/--help; errors on stderr
# Returns:   0 on success. Exits 1 on an unknown option, an option
#            without a value, or a non-integer --total.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      # Each value-taking option checks $# first: `shift 2` with only one
      # argument left fails WITHOUT shifting, which would loop forever.
      --gpus)
        [[ $# -ge 2 ]] || die "Option $1 requires a value"
        GPUS="$2"
        shift 2
        ;;
      --total)
        [[ $# -ge 2 ]] || die "Option $1 requires a value"
        TOTAL_SAMPLES="$2"
        shift 2
        ;;
      --model)
        [[ $# -ge 2 ]] || die "Option $1 requires a value"
        MODEL_PATH="$2"
        shift 2
        ;;
      --output)
        [[ $# -ge 2 ]] || die "Option $1 requires a value"
        OUTPUT_LOG="$2"
        shift 2
        ;;
      -h | --help)
        printf 'Usage: %s [--gpus "0,1,2,3"] [--total N] [--model PATH] [--output FILE]\n' \
          "${0##*/}"
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  # The sample count drives integer comparisons and arithmetic later on;
  # reject anything else up front instead of failing mid-run.
  [[ "$TOTAL_SAMPLES" =~ ^[0-9]+$ ]] \
    || die "--total must be a non-negative integer (got '$TOTAL_SAMPLES')"
  # Force base 10 so a value such as "010" is not read as octal.
  TOTAL_SAMPLES=$((10#$TOTAL_SAMPLES))
}

parse_args "$@"

# Convert GPU string to array
IFS=',' read -ra GPU_ARRAY <<< "$GPUS"
NUM_GPUS=${#GPU_ARRAY[@]}

# GPU ids are added to a base port number later, so they must be integers,
# and at least one is needed or no sample would ever run.
[[ "$NUM_GPUS" -gt 0 ]] || die "--gpus must list at least one GPU id"
for gpu_id in "${GPU_ARRAY[@]}"; do
  [[ "$gpu_id" =~ ^[0-9]+$ ]] \
    || die "Invalid GPU id '$gpu_id' in --gpus '$GPUS' (expected integers)"
done
unset gpu_id
|
||||
|
||||
# Show the effective configuration so every run is self-describing.
cat <<EOF
============================================================
RULER NIAH Parallel Test
============================================================
GPUs: ${GPUS} (${NUM_GPUS} GPUs)
Total samples: ${TOTAL_SAMPLES}
Model: ${MODEL_PATH}
Output log: ${OUTPUT_LOG}
Project root: ${PROJECT_ROOT}
============================================================

EOF

# Create output directory
mkdir -p -- "$(dirname -- "$OUTPUT_LOG")" || {
  echo "Cannot create directory for output log: $OUTPUT_LOG" >&2
  exit 1
}

# Initialize result tracking.
# mktemp picks an unused, private directory; the previous fixed name
# (/tmp/ruler_niah_results_<pid>) was predictable and could collide with a
# stale directory left behind by an earlier run that had the same PID.
RESULT_DIR="$(mktemp -d /tmp/ruler_niah_results_XXXXXX)" || {
  echo "Cannot create temporary result directory under /tmp" >&2
  exit 1
}
|
||||
|
||||
#######################################
# Run one sample in its own Python process on a single GPU.
# Globals:   RESULT_DIR, SCRIPT_DIR, PROJECT_ROOT, MODEL_PATH (read)
# Arguments: $1 - GPU id (integer; also used as the port offset)
#            $2 - sample index passed to --sample-indices
# Outputs:   the Python process's stdout and stderr, merged on stdout;
#            writes PASS or FAIL to $RESULT_DIR/sample_<index>.result
# Returns:   the exit status of the Python process
#######################################
run_sample() {
  local gpu=$1
  local sample_idx=$2
  local result_file="$RESULT_DIR/sample_${sample_idx}.result"
  # One port per GPU so workers running at the same time do not share one.
  local port=$((2333 + gpu))
  local exit_code=0

  # ${PYTHONPATH:+...} appends the caller's PYTHONPATH only when it is set;
  # a bare "$PROJECT_ROOT:" would leave a trailing empty path entry.
  NANOVLLM_DIST_PORT="$port" \
  CUDA_VISIBLE_DEVICES="$gpu" \
  PYTHONPATH="${PROJECT_ROOT}${PYTHONPATH:+:${PYTHONPATH}}" \
    python "$SCRIPT_DIR/test_ruler_niah.py" \
      --model "$MODEL_PATH" \
      --enable-offload \
      --sample-indices "$sample_idx" \
      --quiet \
      2>&1 || exit_code=$?

  if [[ "$exit_code" -eq 0 ]]; then
    echo "PASS" > "$result_file"
  else
    echo "FAIL" > "$result_file"
  fi

  return "$exit_code"
}
|
||||
|
||||
#######################################
# Run every sample assigned to one GPU, sequentially.
# The worker starts at sample index $2 and advances by NUM_GPUS each time,
# stopping once the index reaches TOTAL_SAMPLES.
# Globals:   RESULT_DIR, TOTAL_SAMPLES, NUM_GPUS (read)
# Arguments: $1 - GPU id
#            $2 - position of this GPU in the GPU list (first sample index)
# Outputs:   progress lines on stdout, also appended to
#            $RESULT_DIR/gpu_<id>.log together with run_sample's output;
#            writes "<passed> <failed>" to $RESULT_DIR/gpu_<id>.summary
#######################################
run_gpu_worker() {
  local gpu=$1
  local gpu_idx=$2
  local log_file="$RESULT_DIR/gpu_${gpu}.log"
  local sample_idx=$gpu_idx
  local pass_count=0
  local fail_count=0
  # Declared separately so `local` does not mask the status of $(date).
  local start_time end_time duration

  echo "[GPU $gpu] Starting worker (gpu_idx=$gpu_idx)" | tee -a "$log_file"

  while [[ "$sample_idx" -lt "$TOTAL_SAMPLES" ]]; do
    echo "[GPU $gpu] Testing sample $sample_idx..." | tee -a "$log_file"

    start_time=$(date +%s)

    if run_sample "$gpu" "$sample_idx" >> "$log_file" 2>&1; then
      echo "[GPU $gpu] Sample $sample_idx: PASS" | tee -a "$log_file"
      pass_count=$((pass_count + 1))
    else
      echo "[GPU $gpu] Sample $sample_idx: FAIL" | tee -a "$log_file"
      fail_count=$((fail_count + 1))
    fi

    end_time=$(date +%s)
    duration=$((end_time - start_time))
    echo "[GPU $gpu] Sample $sample_idx completed in ${duration}s" | tee -a "$log_file"

    # Move to next sample for this GPU (stride by number of GPUs)
    sample_idx=$((sample_idx + NUM_GPUS))

    # Small delay to avoid port conflicts with the next run on this GPU;
    # pointless after the final sample, so it is skipped there.
    if [[ "$sample_idx" -lt "$TOTAL_SAMPLES" ]]; then
      sleep 2
    fi
  done

  echo "[GPU $gpu] Worker finished: $pass_count passed, $fail_count failed" | tee -a "$log_file"
  echo "$pass_count $fail_count" > "$RESULT_DIR/gpu_${gpu}.summary"
}
|
||||
|
||||
#######################################
# Start one background worker per entry of GPU_ARRAY.
# Globals:   GPU_ARRAY (read), PIDS (written: one PID per worker)
# Outputs:   one "Launching worker" line per GPU on stdout
#######################################
launch_workers() {
  local i gpu
  PIDS=()
  for i in "${!GPU_ARRAY[@]}"; do
    gpu=${GPU_ARRAY[$i]}
    echo "Launching worker on GPU $gpu..."
    # The array position (not the GPU id) selects this worker's samples.
    run_gpu_worker "$gpu" "$i" &
    PIDS+=("$!")
  done
}

# Start time
START_TIME=$(date +%s)
echo "Starting parallel test at $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# Launch workers for each GPU in background
launch_workers

echo ""
echo "All workers launched. Waiting for completion..."
echo "Monitor progress with: tail -f $RESULT_DIR/gpu_*.log"
echo ""

# Wait for all workers to complete
for pid in "${PIDS[@]}"; do
  wait "$pid"
done

# End time
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
|
||||
|
||||
#######################################
# Print numerator/denominator with a fixed number of decimals.
# Arguments: $1 - numerator, $2 - denominator, $3 - number of decimals
# Outputs:   the quotient on stdout, or "n/a" when the denominator is 0
#            (e.g. a run that finished in under one second)
#######################################
_ratio() {
  # awk instead of bc: no extra dependency, no divide-by-zero error, and
  # values below 1 keep their leading zero. LC_ALL=C pins the decimal point.
  LC_ALL=C awk -v n="$1" -v d="$2" -v p="$3" \
    'BEGIN { if (d + 0 == 0) { print "n/a"; exit } printf("%." p "f\n", n / d) }'
}

echo ""
echo "============================================================"
echo "FINAL RESULTS"
echo "============================================================"

# Aggregate results
TOTAL_PASS=0
TOTAL_FAIL=0

for gpu in "${GPU_ARRAY[@]}"; do
  summary_file="$RESULT_DIR/gpu_${gpu}.summary"
  if [[ -f "$summary_file" ]]; then
    read -r pass fail < "$summary_file"
    TOTAL_PASS=$((TOTAL_PASS + pass))
    TOTAL_FAIL=$((TOTAL_FAIL + fail))
    echo "GPU $gpu: $pass passed, $fail failed"
  else
    # A worker writes its summary as its last step, so a missing file means
    # it never finished; say so instead of silently dropping its samples.
    echo "GPU $gpu: no summary file found (worker did not finish?)" >&2
  fi
done

TOTAL_TESTED=$((TOTAL_PASS + TOTAL_FAIL))
if [[ "$TOTAL_TESTED" -gt 0 ]]; then
  ACCURACY=$(_ratio "$((TOTAL_PASS * 100))" "$TOTAL_TESTED" 1)
else
  ACCURACY="0.0"
fi

echo ""
echo "------------------------------------------------------------"
echo "Total: $TOTAL_PASS/$TOTAL_TESTED passed ($ACCURACY%)"
echo "Duration: ${DURATION}s ($(_ratio "${DURATION:-0}" 60 1) minutes)"
echo "Throughput: $(_ratio "$((TOTAL_TESTED * 60))" "${DURATION:-0}" 2) samples/min"
echo "------------------------------------------------------------"
|
||||
|
||||
#######################################
# Print the detailed run report: totals followed by one line per sample
# that produced a result file.
# Globals:   GPUS, TOTAL_SAMPLES, TOTAL_TESTED, TOTAL_PASS, TOTAL_FAIL,
#            ACCURACY, DURATION, RESULT_DIR (read)
# Outputs:   the report on stdout
#######################################
write_report() {
  local i result_file

  echo "RULER NIAH Parallel Test Results"
  echo "================================"
  echo "Date: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "GPUs: $GPUS"
  echo "Total samples: $TOTAL_TESTED"
  echo "Passed: $TOTAL_PASS"
  echo "Failed: $TOTAL_FAIL"
  echo "Accuracy: $ACCURACY%"
  echo "Duration: ${DURATION}s"
  echo ""
  echo "Per-sample results:"
  # An arithmetic loop rather than `seq 0 $((N - 1))`: for N=0 some seq
  # implementations count down (0, -1) instead of printing nothing.
  for ((i = 0; i < TOTAL_SAMPLES; i++)); do
    result_file="$RESULT_DIR/sample_${i}.result"
    if [[ -f "$result_file" ]]; then
      echo "Sample $i: $(<"$result_file")"
    fi
  done
}

# Save detailed results
write_report > "$OUTPUT_LOG"

echo ""
echo "Detailed results saved to: $OUTPUT_LOG"
|
||||
|
||||
# Cleanup
# NOTE(review): removal of RESULT_DIR is left disabled, as in the original;
# presumably so the per-GPU logs stay available after the run — confirm.
# rm -rf "$RESULT_DIR"

# Exit with appropriate code.
# Success requires zero failures AND a result for every requested sample:
# a worker that died before writing its summary contributes no failures,
# which must not be reported as "ALL PASSED".
if [[ "$TOTAL_FAIL" -ne 0 ]]; then
  echo ""
  echo "test_ruler_niah.sh: $TOTAL_FAIL FAILED"
  exit 1
elif [[ "$TOTAL_TESTED" -lt "$TOTAL_SAMPLES" ]]; then
  echo ""
  echo "test_ruler_niah.sh: INCOMPLETE ($TOTAL_TESTED of $TOTAL_SAMPLES samples produced a result)"
  exit 1
else
  echo ""
  echo "test_ruler_niah.sh: ALL PASSED"
  exit 0
fi
|
||||
Reference in New Issue
Block a user