test: add parallel multi-GPU RULER NIAH test script

Add test_ruler_niah.sh for independent sample testing across multiple GPUs.
Each sample runs in a separate Python process to avoid state accumulation issues.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Zijie Tian
2026-01-12 21:08:27 +08:00
parent 64971c8e8a
commit 1424e665e7

242
tests/test_ruler_niah.sh Executable file
View File

@@ -0,0 +1,242 @@
#!/bin/bash
#
# RULER NIAH Parallel Test Script
#
# Runs RULER NIAH benchmark across multiple GPUs in parallel.
# Each sample is tested independently (separate Python process per sample).
#
# Usage:
# ./tests/test_ruler_niah.sh [OPTIONS]
#
# Options:
# --gpus "0,1,2,3" GPUs to use (default: "0,1,2,3")
# --total N Total samples to test (default: 100)
# --model PATH Model path (default: ~/models/Llama-3.1-8B-Instruct)
# --output FILE Output log file (default: /tmp/ruler_niah_results.log)
#
# Note: Removed 'set -e' because ((var++)) returns 1 when var=0, which triggers exit
# ---------------------------------------------------------------------------
# Defaults. The first four can be overridden on the command line with
# --gpus, --total, --model and --output respectively.
# ---------------------------------------------------------------------------
GPUS='0,1,2,3'
TOTAL_SAMPLES=100
MODEL_PATH="${HOME}/models/Llama-3.1-8B-Instruct"
OUTPUT_LOG='/tmp/ruler_niah_results.log'

# Absolute path of the directory holding this script, and of its parent.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_ROOT=$(dirname "${SCRIPT_DIR}")
# Parse arguments
#
# parse_args ARGS...
#   Applies the command-line options to GPUS, TOTAL_SAMPLES, MODEL_PATH and
#   OUTPUT_LOG. Prints a message to stderr and exits with status 1 on:
#     - an unknown option,
#     - an option that is not followed by a value,
#     - a --gpus value that is not a comma-separated list of integers,
#     - a --total value that is not a non-negative integer.
parse_args() {
  local gpu_list_re='^[0-9]+(,[0-9]+)*$'
  local count_re='^[0-9]+$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpus | --total | --model | --output)
        # Every option takes a value. Without this check `shift 2` fails when
        # the value is missing, leaves the arguments untouched, and the loop
        # spins forever.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for option: $1" >&2
          exit 1
        fi
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac

    case "$1" in
      --gpus)
        # Each GPU id is later added to a base port number, so it has to be
        # a plain integer; an empty list would run no tests at all.
        if [[ ! "$2" =~ $gpu_list_re ]]; then
          echo "Invalid --gpus value (expected e.g. \"0,1,2,3\"): $2" >&2
          exit 1
        fi
        GPUS="$2"
        ;;
      --total)
        if [[ ! "$2" =~ $count_re ]]; then
          echo "Invalid --total value (expected a non-negative integer): $2" >&2
          exit 1
        fi
        # Force base 10 so that a value such as "010" is not treated as octal
        # by later arithmetic expansions.
        TOTAL_SAMPLES=$((10#$2))
        ;;
      --model)
        MODEL_PATH="$2"
        ;;
      --output)
        OUTPUT_LOG="$2"
        ;;
    esac
    shift 2
  done
}

parse_args "$@"
# Split the comma-separated GPU list into GPU_ARRAY and count its entries.
IFS=, read -r -a GPU_ARRAY <<< "${GPUS}"
NUM_GPUS="${#GPU_ARRAY[@]}"

# Print the run configuration banner, followed by one blank line.
printf '%s\n' \
  "============================================================" \
  "RULER NIAH Parallel Test" \
  "============================================================" \
  "GPUs: ${GPUS} (${NUM_GPUS} GPUs)" \
  "Total samples: ${TOTAL_SAMPLES}" \
  "Model: ${MODEL_PATH}" \
  "Output log: ${OUTPUT_LOG}" \
  "Project root: ${PROJECT_ROOT}" \
  "============================================================" \
  ""
# Create output directory
if ! mkdir -p -- "$(dirname -- "$OUTPUT_LOG")"; then
  echo "Cannot create directory for output log: $OUTPUT_LOG" >&2
  exit 1
fi

# Initialize result tracking
# RESULT_DIR is the per-run scratch directory that receives the worker logs,
# the per-sample result files and the per-GPU summaries. mktemp creates it
# with an unpredictable suffix, so another user cannot pre-create or guess
# the path the way they could with a PID-based name in /tmp.
if ! RESULT_DIR="$(mktemp -d /tmp/ruler_niah_results_XXXXXX)"; then
  echo "Cannot create temporary result directory under /tmp" >&2
  exit 1
fi
# Function to run a single sample on a specific GPU
#
# Arguments:
#   $1 - GPU id; passed to the child as CUDA_VISIBLE_DEVICES and added to
#        2333 to form NANOVLLM_DIST_PORT
#   $2 - sample index, forwarded to test_ruler_niah.py via --sample-indices
# Globals read: RESULT_DIR, PROJECT_ROOT, SCRIPT_DIR, MODEL_PATH, PYTHONPATH
# Outputs: the Python process's stdout and stderr, merged onto stdout;
#          writes PASS or FAIL to $RESULT_DIR/sample_<index>.result
# Returns: the exit status of the Python process
run_sample() {
  local gpu=$1
  local sample_idx=$2
  local result_file="$RESULT_DIR/sample_${sample_idx}.result"
  # Run test with unique port based on GPU
  local port=$((2333 + gpu))
  local exit_code=0

  # ${PYTHONPATH:+...} appends the existing value only when it is non-empty.
  # A bare "$PROJECT_ROOT:" would leave an empty path entry, which Python
  # treats as the current working directory.
  NANOVLLM_DIST_PORT="$port" \
  CUDA_VISIBLE_DEVICES="$gpu" \
  PYTHONPATH="${PROJECT_ROOT}${PYTHONPATH:+:${PYTHONPATH}}" \
    python "$SCRIPT_DIR/test_ruler_niah.py" \
      --model "$MODEL_PATH" \
      --enable-offload \
      --sample-indices "$sample_idx" \
      --quiet \
      2>&1 || exit_code=$?

  if [[ "$exit_code" -eq 0 ]]; then
    echo "PASS" > "$result_file"
  else
    echo "FAIL" > "$result_file"
  fi
  return "$exit_code"
}
# Worker loop for one GPU.
#
# Arguments:
#   $1 - GPU id, handed to run_sample and used in the log/summary file names
#   $2 - position of this GPU in the GPU list; also the first sample index
# Globals read: RESULT_DIR, TOTAL_SAMPLES, NUM_GPUS
# Tests samples $2, $2+NUM_GPUS, $2+2*NUM_GPUS, ... while the index is below
# TOTAL_SAMPLES. Progress lines go to stdout and to $RESULT_DIR/gpu_<id>.log;
# "<passed> <failed>" is written to $RESULT_DIR/gpu_<id>.summary at the end.
run_gpu_worker() {
  local gpu=$1
  local gpu_idx=$2
  local log_file="$RESULT_DIR/gpu_${gpu}.log"
  local idx verdict began ended
  local passed=0
  local failed=0

  echo "[GPU $gpu] Starting worker (gpu_idx=$gpu_idx)" | tee -a "$log_file"

  # Stride through the sample indices by the number of GPUs so that the
  # workers split the samples between them without overlap.
  for ((idx = gpu_idx; idx < TOTAL_SAMPLES; idx += NUM_GPUS)); do
    echo "[GPU $gpu] Testing sample $idx..." | tee -a "$log_file"
    began=$(date +%s)

    # The sample's own output goes to the log only, not to stdout.
    if run_sample "$gpu" "$idx" >> "$log_file" 2>&1; then
      verdict=PASS
      passed=$((passed + 1))
    else
      verdict=FAIL
      failed=$((failed + 1))
    fi
    echo "[GPU $gpu] Sample $idx: $verdict" | tee -a "$log_file"

    ended=$(date +%s)
    echo "[GPU $gpu] Sample $idx completed in $((ended - began))s" | tee -a "$log_file"

    # Small delay to avoid port conflicts
    sleep 2
  done

  echo "[GPU $gpu] Worker finished: $passed passed, $failed failed" | tee -a "$log_file"
  echo "$passed $failed" > "$RESULT_DIR/gpu_${gpu}.summary"
}
# Record the start of the run (epoch seconds) and announce it.
START_TIME="$(date +%s)"
printf 'Starting parallel test at %s\n\n' "$(date '+%Y-%m-%d %H:%M:%S')"

# Start one background worker per listed GPU. The array index is passed as
# the worker's second argument.
PIDS=()
for i in "${!GPU_ARRAY[@]}"; do
  gpu="${GPU_ARRAY[i]}"
  printf 'Launching worker on GPU %s...\n' "$gpu"
  run_gpu_worker $gpu $i &
  PIDS+=($!)
done

printf '\n%s\n%s\n\n' \
  "All workers launched. Waiting for completion..." \
  "Monitor progress with: tail -f $RESULT_DIR/gpu_*.log"

# Block until every worker has exited; their exit statuses are not inspected.
for pid in "${PIDS[@]}"; do
  wait $pid
done
# End time
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))

# fixed_ratio NUM DEN SCALE
#   Prints NUM / DEN truncated (not rounded) to SCALE decimal places, always
#   with a leading digit, or "n/a" when DEN is 0. Uses awk so that the
#   summary does not depend on bc being installed and never divides by zero.
fixed_ratio() {
  LC_ALL=C awk -v n="$1" -v d="$2" -v s="$3" 'BEGIN {
    if (d == 0) { print "n/a"; exit }
    m = 10 ^ s
    fmt = "%." s "f\n"
    printf fmt, int(n * m / d) / m
  }'
}

echo ""
echo "============================================================"
echo "FINAL RESULTS"
echo "============================================================"
# Aggregate results
TOTAL_PASS=0
TOTAL_FAIL=0
for gpu in "${GPU_ARRAY[@]}"; do
  if [[ -f "$RESULT_DIR/gpu_${gpu}.summary" ]]; then
    # Each summary file holds "<passed> <failed>" on a single line.
    read -r pass fail < "$RESULT_DIR/gpu_${gpu}.summary"
    TOTAL_PASS=$((TOTAL_PASS + ${pass:-0}))
    TOTAL_FAIL=$((TOTAL_FAIL + ${fail:-0}))
    echo "GPU $gpu: $pass passed, $fail failed"
  else
    # A worker that never wrote its summary contributes nothing to the
    # totals; say so instead of skipping it silently.
    echo "GPU $gpu: no summary file found (worker did not finish)" >&2
  fi
done
TOTAL_TESTED=$((TOTAL_PASS + TOTAL_FAIL))
if [[ "$TOTAL_TESTED" -gt 0 ]]; then
  ACCURACY=$(fixed_ratio "$((TOTAL_PASS * 100))" "$TOTAL_TESTED" 1)
else
  ACCURACY="0.0"
fi
echo ""
echo "------------------------------------------------------------"
echo "Total: $TOTAL_PASS/$TOTAL_TESTED passed ($ACCURACY%)"
echo "Duration: ${DURATION}s ($(fixed_ratio "$DURATION" 60 1) minutes)"
# A run shorter than one second has DURATION=0; fixed_ratio prints "n/a" then.
echo "Throughput: $(fixed_ratio "$((TOTAL_TESTED * 60))" "$DURATION" 2) samples/min"
echo "------------------------------------------------------------"
# Write the report to OUTPUT_LOG (overwriting it): a fixed header with the
# run totals, then one "Sample <i>: <result>" line for every sample index
# below TOTAL_SAMPLES that has a result file in RESULT_DIR.
{
  printf '%s\n' \
    "RULER NIAH Parallel Test Results" \
    "================================" \
    "Date: $(date '+%Y-%m-%d %H:%M:%S')" \
    "GPUs: $GPUS" \
    "Total samples: $TOTAL_TESTED" \
    "Passed: $TOTAL_PASS" \
    "Failed: $TOTAL_FAIL" \
    "Accuracy: $ACCURACY%" \
    "Duration: ${DURATION}s" \
    "" \
    "Per-sample results:"

  for ((i = 0; i < TOTAL_SAMPLES; i++)); do
    # Samples without a result file are left out of the report.
    [[ -f "$RESULT_DIR/sample_${i}.result" ]] || continue
    result=$(< "$RESULT_DIR/sample_${i}.result")
    echo "Sample $i: $result"
  done
} > "$OUTPUT_LOG"
echo ""
echo "Detailed results saved to: $OUTPUT_LOG"
# Cleanup
# rm -rf "$RESULT_DIR"
# Exit with appropriate code
#
# The run counts as passed only when every requested sample produced a
# verdict and none of them failed. If a worker died or never started, its
# samples are missing from the totals while TOTAL_FAIL stays 0, so that case
# is reported as incomplete rather than as a pass.
if [[ "$TOTAL_TESTED" -ne "$TOTAL_SAMPLES" ]]; then
  echo ""
  echo "test_ruler_niah.sh: INCOMPLETE ($TOTAL_TESTED of $TOTAL_SAMPLES samples tested, $TOTAL_FAIL FAILED)"
  exit 1
elif [[ "$TOTAL_FAIL" -eq 0 ]]; then
  echo ""
  echo "test_ruler_niah.sh: ALL PASSED"
  exit 0
else
  echo ""
  echo "test_ruler_niah.sh: $TOTAL_FAIL FAILED"
  exit 1
fi