diff --git a/tests/test_ruler_niah.sh b/tests/test_ruler_niah.sh
new file mode 100755
index 0000000..c48d6b0
--- /dev/null
+++ b/tests/test_ruler_niah.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+#
+# RULER NIAH Parallel Test Script
+#
+# Runs RULER NIAH benchmark across multiple GPUs in parallel.
+# Each sample is tested independently (separate Python process per sample).
+#
+# Usage:
+#   ./tests/test_ruler_niah.sh [OPTIONS]
+#
+# Options:
+#   --gpus "0,1,2,3"     GPUs to use (default: "0,1,2,3")
+#   --total N            Total samples to test (default: 100)
+#   --model PATH         Model path (default: ~/models/Llama-3.1-8B-Instruct)
+#   --output FILE        Output log file (default: /tmp/ruler_niah_results.log)
+#
+
+# Note: 'set -e' is deliberately not used: ((var++)) returns status 1 when var is 0, which would abort the script.
+
+# Default configuration (each value can be overridden by the flags parsed below)
+GPUS="0,1,2,3"
+TOTAL_SAMPLES=100
+MODEL_PATH="$HOME/models/Llama-3.1-8B-Instruct"
+OUTPUT_LOG="/tmp/ruler_niah_results.log"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+# Parse arguments. Every flag takes exactly one value; a flag without a value is
+# rejected before 'shift 2', which would otherwise fail, shift nothing, and loop forever.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --gpus|--total|--model|--output)
+            [[ $# -ge 2 ]] || { echo "Missing value for option: $1" >&2; exit 1; }
+            case "$1" in
+                --gpus)   GPUS="$2" ;;
+                --total)  TOTAL_SAMPLES="$2" ;;
+                --model)  MODEL_PATH="$2" ;;
+                --output) OUTPUT_LOG="$2" ;;
+            esac
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+# TOTAL_SAMPLES is used in numeric tests and arithmetic below, so require an integer.
+if ! [[ "$TOTAL_SAMPLES" =~ ^[0-9]+$ ]]; then
+    echo "--total must be a non-negative integer, got: $TOTAL_SAMPLES" >&2
+    exit 1
+fi
+
+# Convert GPU string to array (one worker is launched per entry)
+IFS=',' read -ra GPU_ARRAY <<< "$GPUS"
+NUM_GPUS=${#GPU_ARRAY[@]}
+# With an empty list no worker would start and nothing would be tested.
+[[ $NUM_GPUS -gt 0 ]] || { echo "No GPUs specified" >&2; exit 1; }
+
+echo "============================================================"
+echo "RULER NIAH Parallel Test"
+echo "============================================================"
+echo "GPUs: ${GPUS} (${NUM_GPUS} GPUs)"
+echo "Total samples: ${TOTAL_SAMPLES}"
+echo "Model: ${MODEL_PATH}"
+echo "Output log: ${OUTPUT_LOG}"
+echo "Project root: ${PROJECT_ROOT}"
+echo "============================================================"
+echo ""
+
+# Create the log's parent directory, then a private scratch directory for per-GPU
+# logs and per-sample results; mktemp picks an unpredictable, not-yet-existing name.
+mkdir -p "$(dirname "$OUTPUT_LOG")" || exit 1
+RESULT_DIR="$(mktemp -d "${TMPDIR:-/tmp}/ruler_niah_results_XXXXXX")" || exit 1
+
+# Run one benchmark sample in its own Python process with CUDA_VISIBLE_DEVICES set to one GPU.
+# Arguments: $1 - GPU id (used in arithmetic for the port), $2 - sample index
+# Globals:   RESULT_DIR, PROJECT_ROOT, SCRIPT_DIR, MODEL_PATH, PYTHONPATH (read)
+# Outputs:   the test's stdout+stderr on stdout; PASS or FAIL written to
+#            $RESULT_DIR/sample_<index>.result
+# Returns:   the exit status of the Python process
+run_sample() {
+    local gpu=$1 sample_idx=$2
+    local result_file="$RESULT_DIR/sample_${sample_idx}.result"
+    local port=$((2333 + gpu))   # distinct port per GPU id
+    local exit_code=0 verdict="FAIL"
+
+    # Append the caller's PYTHONPATH only when it is non-empty, so that an unset
+    # variable does not leave a trailing ':' (an empty search-path entry).
+    NANOVLLM_DIST_PORT=$port \
+    CUDA_VISIBLE_DEVICES=$gpu \
+    PYTHONPATH="$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
+    python "$SCRIPT_DIR/test_ruler_niah.py" \
+        --model "$MODEL_PATH" \
+        --enable-offload \
+        --sample-indices "$sample_idx" \
+        --quiet \
+        2>&1 || exit_code=$?
+
+    if [[ $exit_code -eq 0 ]]; then verdict="PASS"; fi
+    echo "$verdict" > "$result_file"
+    return "$exit_code"
+}
+
+# Worker loop for one GPU: tests samples gpu_idx, gpu_idx + NUM_GPUS, ... below TOTAL_SAMPLES.
+# Arguments: $1 - GPU id, $2 - index of this worker (its first sample index)
+# Globals:   RESULT_DIR, TOTAL_SAMPLES, NUM_GPUS (read)
+# Outputs:   progress lines on stdout and appended to $RESULT_DIR/gpu_<id>.log;
+#            "<passed> <failed>" written to $RESULT_DIR/gpu_<id>.summary
+run_gpu_worker() {
+    local gpu=$1
+    local gpu_idx=$2
+    local log_file="$RESULT_DIR/gpu_${gpu}.log"
+    local tag="[GPU $gpu]"
+    local idx=$gpu_idx passed=0 failed=0 verdict began elapsed
+
+    echo "$tag Starting worker (gpu_idx=$gpu_idx)" | tee -a "$log_file"
+
+    while [ "$idx" -lt "$TOTAL_SAMPLES" ]; do
+        echo "$tag Testing sample $idx..." | tee -a "$log_file"
+        began=$(date +%s)
+
+        # The sample's own output is captured in the log file only.
+        if run_sample "$gpu" "$idx" >> "$log_file" 2>&1; then
+            verdict="PASS"
+            passed=$((passed + 1))
+        else
+            verdict="FAIL"
+            failed=$((failed + 1))
+        fi
+        echo "$tag Sample $idx: $verdict" | tee -a "$log_file"
+
+        elapsed=$(( $(date +%s) - began ))
+        echo "$tag Sample $idx completed in ${elapsed}s" | tee -a "$log_file"
+
+        # Advance by the number of GPUs, then pause briefly before the next run
+        # (a small delay intended to avoid port conflicts).
+        idx=$((idx + NUM_GPUS))
+        sleep 2
+    done
+
+    echo "$tag Worker finished: $passed passed, $failed failed" | tee -a "$log_file"
+    echo "$passed $failed" > "$RESULT_DIR/gpu_${gpu}.summary"
+}
+
+# Start time (seconds since the epoch), used later for the duration report
+START_TIME=$(date +%s)
+echo "Starting parallel test at $(date '+%Y-%m-%d %H:%M:%S')"
+echo ""
+
+# Launch one background worker per GPU; the array index is passed as the worker's
+# index. Both arguments are quoted so an empty GPU entry (e.g. from "0,,1") keeps
+# its position instead of letting the index slide into the GPU argument.
+PIDS=()
+for i in "${!GPU_ARRAY[@]}"; do
+    gpu=${GPU_ARRAY[$i]}
+    echo "Launching worker on GPU $gpu..."
+    run_gpu_worker "$gpu" "$i" &
+    PIDS+=("$!")
+done
+
+echo ""
+echo "All workers launched. Waiting for completion..."
+echo "Monitor progress with: tail -f $RESULT_DIR/gpu_*.log"
+echo ""
+
+# Wait for all workers to complete
+for pid in "${PIDS[@]}"; do wait "$pid"; done
+
+# End time; DURATION is the elapsed wall-clock time in whole seconds
+END_TIME=$(date +%s)
+DURATION=$((END_TIME - START_TIME))
+
+echo ""
+echo "============================================================"
+echo "FINAL RESULTS"
+echo "============================================================"
+# Aggregate the "<passed> <failed>" pair each worker wrote to its summary file
+TOTAL_PASS=0
+TOTAL_FAIL=0
+for gpu in "${GPU_ARRAY[@]}"; do
+    if [ -f "$RESULT_DIR/gpu_${gpu}.summary" ]; then
+        read -r pass fail < "$RESULT_DIR/gpu_${gpu}.summary"
+        TOTAL_PASS=$((TOTAL_PASS + pass))
+        TOTAL_FAIL=$((TOTAL_FAIL + fail))
+        echo "GPU $gpu: $pass passed, $fail failed"
+    else
+        echo "GPU $gpu: no summary file found" >&2
+    fi
+done
+TOTAL_TESTED=$((TOTAL_PASS + TOTAL_FAIL))
+
+# Fixed-point values computed with shell integer arithmetic (digits beyond the shown
+# precision are truncated); a zero divisor is skipped so the default is printed instead.
+ACCURACY="0.0" THROUGHPUT="0.00"
+(( TOTAL_TESTED > 0 )) && printf -v ACCURACY '%d.%d' $((TOTAL_PASS * 100 / TOTAL_TESTED)) $((TOTAL_PASS * 1000 / TOTAL_TESTED % 10))
+(( DURATION > 0 )) && printf -v THROUGHPUT '%d.%02d' $((TOTAL_TESTED * 60 / DURATION)) $((TOTAL_TESTED * 6000 / DURATION % 100))
+printf -v MINUTES '%d.%d' $((DURATION / 60)) $((DURATION * 10 / 60 % 10))
+echo ""
+echo "------------------------------------------------------------"
+echo "Total: $TOTAL_PASS/$TOTAL_TESTED passed ($ACCURACY%)"
+echo "Duration: ${DURATION}s ($MINUTES minutes)"
+echo "Throughput: $THROUGHPUT samples/min"
+echo "------------------------------------------------------------"
+
+# Save detailed results: summary header plus one line per sample that has a result file
+{
+    echo "RULER NIAH Parallel Test Results"
+    echo "================================"
+    echo "Date: $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "GPUs: $GPUS"
+    echo "Total samples: $TOTAL_TESTED"
+    echo "Passed: $TOTAL_PASS"
+    echo "Failed: $TOTAL_FAIL"
+    echo "Accuracy: $ACCURACY%"
+    echo "Duration: ${DURATION}s"
+    echo ""
+    echo "Per-sample results:"
+    for ((i = 0; i < TOTAL_SAMPLES; i++)); do
+        if [ -f "$RESULT_DIR/sample_${i}.result" ]; then
+            echo "Sample $i: $(<"$RESULT_DIR/sample_${i}.result")"
+        fi
+    done
+} > "$OUTPUT_LOG"
+
+echo ""
+echo "Detailed results saved to: $OUTPUT_LOG"
+
+# Cleanup (disabled; presumably so the per-GPU logs remain available afterwards)
+# rm -rf "$RESULT_DIR"
+
+# Exit 0 only if nothing failed and as many samples were counted as were requested
+echo ""
+if [ "$TOTAL_FAIL" -eq 0 ] && [ "$TOTAL_TESTED" -eq "$TOTAL_SAMPLES" ]; then
+    echo "test_ruler_niah.sh: ALL PASSED"
+    exit 0
+elif [ "$TOTAL_FAIL" -eq 0 ]; then
+    echo "test_ruler_niah.sh: INCOMPLETE ($TOTAL_TESTED of $TOTAL_SAMPLES samples tested)"
+    exit 1
+fi
+echo "test_ruler_niah.sh: $TOTAL_FAIL FAILED"
+exit 1