#!/bin/bash
#
# RULER NIAH Parallel Test Script
#
# Runs the RULER NIAH benchmark across multiple GPUs in parallel.
# Each sample is tested independently (separate Python process per sample).
# Samples are distributed round-robin: worker k handles samples
# k, k + NUM_GPUS, k + 2 * NUM_GPUS, ...
#
# Usage:
#   ./tests/test_ruler_niah.sh [OPTIONS]
#
# Options:
#   --gpus "0,1,2,3"   GPUs to use (default: "0,1,2,3")
#   --total N          Total samples to test (default: 100)
#   --model PATH       Model path (default: ~/models/Llama-3.1-8B-Instruct)
#   --output FILE      Output log file (default: /tmp/ruler_niah_results.log)
#
# Exit status:
#   0 if every requested sample ran and passed, 1 otherwise.

# 'set -e' is intentionally not enabled: a failing sample is an expected
# outcome that must be counted, not a reason to abort the whole run.
set -u

# Print an error message on stderr and abort the script.
die() {
    printf '%s\n' "$*" >&2
    exit 1
}

# Abort unless the option named $1 is followed by a value.
#   $1 - option name (for the error message)
#   $2 - number of remaining positional arguments, including the option
# Without this check 'shift 2' fails on a trailing flag without consuming
# anything, and the argument loop would spin forever.
require_value() {
    [[ $2 -ge 2 ]] || die "Option $1 requires a value"
}

# Print numerator / denominator truncated (not rounded) to <scale> decimals,
# using integer arithmetic only. A zero denominator yields 0.
#   $1 - numerator, $2 - denominator, $3 - number of decimal places
fixed_div() {
    local numerator=$1
    local denominator=$2
    local scale=$3
    local factor=$((10 ** scale))
    local scaled=0

    if ((denominator != 0)); then
        scaled=$((numerator * factor / denominator))
    fi
    printf "%d.%0${scale}d\n" "$((scaled / factor))" "$((scaled % factor))"
}

# Default configuration
GPUS="0,1,2,3"
TOTAL_SAMPLES=100
MODEL_PATH="$HOME/models/Llama-3.1-8B-Instruct"
OUTPUT_LOG="/tmp/ruler_niah_results.log"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpus)
            require_value "$1" "$#"
            GPUS="$2"
            shift 2
            ;;
        --total)
            require_value "$1" "$#"
            TOTAL_SAMPLES="$2"
            shift 2
            ;;
        --model)
            require_value "$1" "$#"
            MODEL_PATH="$2"
            shift 2
            ;;
        --output)
            require_value "$1" "$#"
            OUTPUT_LOG="$2"
            shift 2
            ;;
        *)
            die "Unknown option: $1"
            ;;
    esac
done

# Validate the sample count; 10# forces base 10 so "010" is not read as octal.
[[ "$TOTAL_SAMPLES" =~ ^[0-9]+$ ]] \
    || die "--total must be a non-negative integer, got: $TOTAL_SAMPLES"
TOTAL_SAMPLES=$((10#$TOTAL_SAMPLES))

# Convert GPU string to array. Ids must be distinct non-negative integers:
# each id is used in port arithmetic and as the key of per-GPU log/summary
# files, so duplicates or non-numeric ids would collide or break.
GPUS="${GPUS//[[:space:]]/}"
[[ -n "$GPUS" ]] || die "--gpus must list at least one GPU id"
IFS=',' read -ra RAW_GPUS <<< "$GPUS"
GPU_ARRAY=()
for raw_gpu in ${RAW_GPUS[@]+"${RAW_GPUS[@]}"}; do
    [[ "$raw_gpu" =~ ^[0-9]+$ ]] || die "Invalid GPU id in --gpus: '$raw_gpu'"
    gpu=$((10#$raw_gpu))
    for seen_gpu in ${GPU_ARRAY[@]+"${GPU_ARRAY[@]}"}; do
        [[ "$seen_gpu" -ne "$gpu" ]] || die "Duplicate GPU id in --gpus: $gpu"
    done
    GPU_ARRAY+=("$gpu")
done
NUM_GPUS=${#GPU_ARRAY[@]}
((NUM_GPUS > 0)) || die "--gpus must list at least one GPU id"

echo "============================================================"
echo "RULER NIAH Parallel Test"
echo "============================================================"
echo "GPUs: ${GPUS} (${NUM_GPUS} GPUs)"
echo "Total samples: ${TOTAL_SAMPLES}"
echo "Model: ${MODEL_PATH}"
echo "Output log: ${OUTPUT_LOG}"
echo "Project root: ${PROJECT_ROOT}"
echo "============================================================"
echo ""

# Create output directory
mkdir -p -- "$(dirname "$OUTPUT_LOG")" \
    || die "Cannot create directory for output log: $OUTPUT_LOG"

# Initialize result tracking in a private, unpredictable directory.
RESULT_DIR="$(mktemp -d /tmp/ruler_niah_results_XXXXXX)" \
    || die "Cannot create temporary result directory"

# Run a single sample on a specific GPU.
#   $1 - GPU id (exported as CUDA_VISIBLE_DEVICES)
#   $2 - sample index passed to test_ruler_niah.py
# Writes PASS or FAIL to $RESULT_DIR/sample_<idx>.result and returns the
# Python process's exit status. Python's stdout and stderr both go to stdout.
run_sample() {
    local gpu=$1
    local sample_idx=$2
    local result_file="$RESULT_DIR/sample_${sample_idx}.result"
    local exit_code=0

    # Unique port per GPU so concurrent workers do not clash.
    local port=$((2333 + gpu))

    # ${PYTHONPATH:+...} avoids a trailing ':' (which Python treats as the
    # current directory) when PYTHONPATH is unset or empty.
    NANOVLLM_DIST_PORT=$port \
    CUDA_VISIBLE_DEVICES=$gpu \
    PYTHONPATH="$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
        python "$SCRIPT_DIR/test_ruler_niah.py" \
        --model "$MODEL_PATH" \
        --enable-offload \
        --sample-indices "$sample_idx" \
        --quiet \
        2>&1 || exit_code=$?

    if [[ $exit_code -eq 0 ]]; then
        echo "PASS" > "$result_file"
    else
        echo "FAIL" > "$result_file"
    fi

    return "$exit_code"
}

# Run every sample assigned to one GPU, sequentially.
#   $1 - GPU id
#   $2 - zero-based position of this GPU in GPU_ARRAY (first sample index)
# Progress goes to stdout and $RESULT_DIR/gpu_<id>.log; the final
# "<passed> <failed>" pair is written to $RESULT_DIR/gpu_<id>.summary.
run_gpu_worker() {
    local gpu=$1
    local gpu_idx=$2
    local log_file="$RESULT_DIR/gpu_${gpu}.log"
    local sample_idx=$gpu_idx
    local pass_count=0
    local fail_count=0
    local start_time end_time duration

    echo "[GPU $gpu] Starting worker (gpu_idx=$gpu_idx)" | tee -a "$log_file"

    while [[ $sample_idx -lt $TOTAL_SAMPLES ]]; do
        echo "[GPU $gpu] Testing sample $sample_idx..." | tee -a "$log_file"

        start_time=$(date +%s)

        if run_sample "$gpu" "$sample_idx" >> "$log_file" 2>&1; then
            echo "[GPU $gpu] Sample $sample_idx: PASS" | tee -a "$log_file"
            pass_count=$((pass_count + 1))
        else
            echo "[GPU $gpu] Sample $sample_idx: FAIL" | tee -a "$log_file"
            fail_count=$((fail_count + 1))
        fi

        end_time=$(date +%s)
        duration=$((end_time - start_time))
        echo "[GPU $gpu] Sample $sample_idx completed in ${duration}s" | tee -a "$log_file"

        # Move to next sample for this GPU (stride by number of GPUs)
        sample_idx=$((sample_idx + NUM_GPUS))

        # Small delay to avoid port conflicts with the process that just
        # exited; pointless once this worker has no sample left to start.
        if [[ $sample_idx -lt $TOTAL_SAMPLES ]]; then
            sleep 2
        fi
    done

    echo "[GPU $gpu] Worker finished: $pass_count passed, $fail_count failed" | tee -a "$log_file"
    echo "$pass_count $fail_count" > "$RESULT_DIR/gpu_${gpu}.summary"
}

# Start time
START_TIME=$(date +%s)
echo "Starting parallel test at $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# Launch workers for each GPU in background
PIDS=()
for i in "${!GPU_ARRAY[@]}"; do
    gpu=${GPU_ARRAY[$i]}
    echo "Launching worker on GPU $gpu..."
    run_gpu_worker "$gpu" "$i" &
    PIDS+=("$!")
done

echo ""
echo "All workers launched. Waiting for completion..."
echo "Monitor progress with: tail -f $RESULT_DIR/gpu_*.log"
echo ""

# Wait for all workers to complete
for pid in "${PIDS[@]}"; do
    wait "$pid" || echo "Worker process $pid exited abnormally" >&2
done

# End time
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))

echo ""
echo "============================================================"
echo "FINAL RESULTS"
echo "============================================================"

# Aggregate results
TOTAL_PASS=0
TOTAL_FAIL=0

for gpu in "${GPU_ARRAY[@]}"; do
    summary_file="$RESULT_DIR/gpu_${gpu}.summary"
    if [[ -f "$summary_file" ]] && read -r pass fail < "$summary_file"; then
        TOTAL_PASS=$((TOTAL_PASS + ${pass:-0}))
        TOTAL_FAIL=$((TOTAL_FAIL + ${fail:-0}))
        echo "GPU $gpu: ${pass:-0} passed, ${fail:-0} failed"
    else
        echo "GPU $gpu: no summary found (worker did not finish)" >&2
    fi
done

TOTAL_TESTED=$((TOTAL_PASS + TOTAL_FAIL))
ACCURACY=$(fixed_div "$((TOTAL_PASS * 100))" "$TOTAL_TESTED" 1)

echo ""
echo "------------------------------------------------------------"
echo "Total: $TOTAL_PASS/$TOTAL_TESTED passed ($ACCURACY%)"
echo "Duration: ${DURATION}s ($(fixed_div "$DURATION" 60 1) minutes)"
# fixed_div guards the zero-duration case (run finished in under a second).
echo "Throughput: $(fixed_div "$((TOTAL_TESTED * 60))" "$DURATION" 2) samples/min"
echo "------------------------------------------------------------"

# Save detailed results
{
    echo "RULER NIAH Parallel Test Results"
    echo "================================"
    echo "Date: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "GPUs: $GPUS"
    echo "Total samples: $TOTAL_TESTED"
    echo "Passed: $TOTAL_PASS"
    echo "Failed: $TOTAL_FAIL"
    echo "Accuracy: $ACCURACY%"
    echo "Duration: ${DURATION}s"
    echo ""
    echo "Per-sample results:"
    # C-style loop instead of 'seq': BSD seq counts downwards for 'seq 0 -1',
    # which would emit bogus indices when TOTAL_SAMPLES is 0.
    for ((i = 0; i < TOTAL_SAMPLES; i++)); do
        if [[ -f "$RESULT_DIR/sample_${i}.result" ]]; then
            result=$(< "$RESULT_DIR/sample_${i}.result")
            echo "Sample $i: $result"
        fi
    done
} > "$OUTPUT_LOG"

echo ""
echo "Detailed results saved to: $OUTPUT_LOG"

# Cleanup is left disabled so the per-GPU logs in $RESULT_DIR stay available.
# rm -rf -- "$RESULT_DIR"

# Exit with appropriate code. A run in which some samples never produced a
# result must not be reported as a success.
if [[ $TOTAL_FAIL -gt 0 ]]; then
    echo ""
    echo "test_ruler_niah.sh: $TOTAL_FAIL FAILED"
    exit 1
elif [[ $TOTAL_TESTED -ne $TOTAL_SAMPLES ]]; then
    echo ""
    echo "test_ruler_niah.sh: INCOMPLETE ($TOTAL_TESTED of $TOTAL_SAMPLES samples ran)"
    exit 1
else
    echo ""
    echo "test_ruler_niah.sh: ALL PASSED"
    exit 0
fi