#!/bin/bash
#
# RULER NIAH Parallel Test Script
#
# Runs RULER NIAH benchmark across multiple GPUs in parallel.
# Each sample is tested independently (separate Python process per sample).
#
# Usage:
#   ./tests/test_ruler_niah.sh [OPTIONS]
#
# Options:
#   --gpus "0,1,2,3"   GPUs to use (default: "0,1,2,3")
#   --total N          Total samples to test (default: 100)
#   --model PATH       Model path (default: ~/models/Llama-3.1-8B-Instruct)
#   --output FILE      Output log file (default: /tmp/ruler_niah_results.log)
#   -h, --help         Show this help and exit
#
# Note: 'set -e' is deliberately not enabled. It was removed because
# ((var++)) returns 1 when var=0, which triggers exit. Commands whose
# failure matters are checked explicitly instead.

# Default configuration
GPUS="0,1,2,3"
TOTAL_SAMPLES=100
MODEL_PATH="$HOME/models/Llama-3.1-8B-Instruct"
OUTPUT_LOG="/tmp/ruler_niah_results.log"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"

# Print the command-line help text to stdout.
usage() {
  cat <<'EOF'
Usage: ./tests/test_ruler_niah.sh [OPTIONS]

Options:
  --gpus "0,1,2,3"   GPUs to use (default: "0,1,2,3")
  --total N          Total samples to test (default: 100)
  --model PATH       Model path (default: ~/models/Llama-3.1-8B-Instruct)
  --output FILE      Output log file (default: /tmp/ruler_niah_results.log)
  -h, --help         Show this help and exit
EOF
}

# Print an error message to stderr and abort the script with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --gpus | --total | --model | --output)
      # 'shift 2' fails without shifting anything when only one argument
      # is left, which would make this loop spin forever. Check first.
      if [[ $# -lt 2 ]]; then
        die "option $1 requires a value"
      fi
      case "$1" in
        --gpus) GPUS=$2 ;;
        --total) TOTAL_SAMPLES=$2 ;;
        --model) MODEL_PATH=$2 ;;
        --output) OUTPUT_LOG=$2 ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# The sample count feeds integer comparisons and arithmetic below.
if [[ ! "$TOTAL_SAMPLES" =~ ^[0-9]+$ ]]; then
  die "--total must be a non-negative integer (got '$TOTAL_SAMPLES')"
fi
# Force base 10 so a value such as "010" is not read as octal later on.
TOTAL_SAMPLES=$((10#$TOTAL_SAMPLES))

# Convert GPU string to array. Each id must be a plain integer because the
# per-worker port is computed as 2333 + id, and ids must be unique because
# log and summary file names are keyed by GPU id.
IFS=',' read -ra RAW_GPUS <<< "$GPUS"
GPU_ARRAY=()
seen_gpus=","
for gpu in "${RAW_GPUS[@]}"; do
  gpu=${gpu//[[:space:]]/}
  if [[ ! "$gpu" =~ ^[0-9]+$ ]]; then
    die "--gpus must be a comma-separated list of integer GPU ids (got '$GPUS')"
  fi
  gpu=$((10#$gpu))
  case "$seen_gpus" in
    *",$gpu,"*) die "--gpus lists GPU $gpu more than once" ;;
  esac
  seen_gpus+="$gpu,"
  GPU_ARRAY+=("$gpu")
done
NUM_GPUS=${#GPU_ARRAY[@]}
if [[ $NUM_GPUS -eq 0 ]]; then
  die "--gpus must list at least one GPU id"
fi

echo "============================================================"
echo "RULER NIAH Parallel Test"
echo "============================================================"
echo "GPUs: ${GPUS} (${NUM_GPUS} GPUs)"
echo "Total samples: ${TOTAL_SAMPLES}"
echo "Model: ${MODEL_PATH}"
echo "Output log: ${OUTPUT_LOG}"
echo "Project root: ${PROJECT_ROOT}"
echo "============================================================"
echo ""

# Create output directory
mkdir -p -- "$(dirname -- "$OUTPUT_LOG")" \
  || die "cannot create directory for output log '$OUTPUT_LOG'"

# Initialize result tracking. mktemp gives an unpredictable, freshly created
# directory instead of a guessable /tmp name.
RESULT_DIR=$(mktemp -d "${TMPDIR:-/tmp}/ruler_niah_results_XXXXXX") \
  || die "cannot create temporary result directory"

#######################################
# Run a single sample on a specific GPU.
# Globals:   RESULT_DIR, PROJECT_ROOT, SCRIPT_DIR, MODEL_PATH (read)
# Arguments: $1 - GPU id (integer), $2 - sample index
# Outputs:   test output (stdout and stderr merged) on stdout;
#            writes PASS or FAIL to $RESULT_DIR/sample_<idx>.result
# Returns:   exit status of the Python test process
#######################################
run_sample() {
  local gpu=$1
  local sample_idx=$2
  local result_file="$RESULT_DIR/sample_${sample_idx}.result"
  local exit_code=0

  # Run test with unique port based on GPU
  local port=$((2333 + gpu))

  # ${PYTHONPATH:+...} avoids a trailing ':' (an empty path entry) when
  # PYTHONPATH is unset or empty.
  NANOVLLM_DIST_PORT=$port \
  CUDA_VISIBLE_DEVICES=$gpu \
  PYTHONPATH="${PROJECT_ROOT}${PYTHONPATH:+:${PYTHONPATH}}" \
    python "$SCRIPT_DIR/test_ruler_niah.py" \
      --model "$MODEL_PATH" \
      --enable-offload \
      --sample-indices "$sample_idx" \
      --quiet \
      2>&1 || exit_code=$?

  if [[ $exit_code -eq 0 ]]; then
    echo "PASS" > "$result_file"
  else
    echo "FAIL" > "$result_file"
  fi

  return "$exit_code"
}

#######################################
# Run every sample assigned to one GPU, one after another.
# The worker starts at sample <gpu_idx> and strides by NUM_GPUS.
# Globals:   RESULT_DIR, TOTAL_SAMPLES, NUM_GPUS (read)
# Arguments: $1 - GPU id, $2 - position of this GPU in GPU_ARRAY
# Outputs:   progress lines on stdout and in $RESULT_DIR/gpu_<id>.log;
#            writes "<passed> <failed>" to $RESULT_DIR/gpu_<id>.summary
#######################################
run_gpu_worker() {
  local gpu=$1
  local gpu_idx=$2
  local log_file="$RESULT_DIR/gpu_${gpu}.log"

  echo "[GPU $gpu] Starting worker (gpu_idx=$gpu_idx)" | tee -a "$log_file"

  # Calculate which samples this GPU handles
  local sample_idx=$gpu_idx
  local pass_count=0
  local fail_count=0
  # Declared separately from the assignments so 'local' does not mask the
  # exit status of the command substitutions.
  local start_time end_time duration

  while [[ $sample_idx -lt $TOTAL_SAMPLES ]]; do
    echo "[GPU $gpu] Testing sample $sample_idx..." | tee -a "$log_file"

    start_time=$(date +%s)

    if run_sample "$gpu" "$sample_idx" >> "$log_file" 2>&1; then
      echo "[GPU $gpu] Sample $sample_idx: PASS" | tee -a "$log_file"
      # Assignment form: unlike ((var++)), its status is never non-zero.
      pass_count=$((pass_count + 1))
    else
      echo "[GPU $gpu] Sample $sample_idx: FAIL" | tee -a "$log_file"
      fail_count=$((fail_count + 1))
    fi

    end_time=$(date +%s)
    duration=$((end_time - start_time))
    echo "[GPU $gpu] Sample $sample_idx completed in ${duration}s" | tee -a "$log_file"

    # Move to next sample for this GPU (stride by number of GPUs)
    sample_idx=$((sample_idx + NUM_GPUS))

    # Small delay to avoid port conflicts
    sleep 2
  done

  echo "[GPU $gpu] Worker finished: $pass_count passed, $fail_count failed" | tee -a "$log_file"
  echo "$pass_count $fail_count" > "$RESULT_DIR/gpu_${gpu}.summary"
}

# Start time
START_TIME=$(date +%s)

echo "Starting parallel test at $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# Launch workers for each GPU in background
PIDS=()
for i in "${!GPU_ARRAY[@]}"; do
  gpu=${GPU_ARRAY[$i]}
  echo "Launching worker on GPU $gpu..."
  run_gpu_worker "$gpu" "$i" &
  PIDS+=("$!")
done

echo ""
echo "All workers launched. Waiting for completion..."
echo "Monitor progress with: tail -f $RESULT_DIR/gpu_*.log"
echo ""

#######################################
# Print num/den truncated to a fixed number of decimals, using integer
# arithmetic only (no external 'bc'). Expects non-negative integers.
# Arguments: $1 - numerator, $2 - denominator, $3 - number of decimals
# Outputs:   the quotient (e.g. "71.4"), or "n/a" when the denominator is 0
#######################################
fixed_div() {
  local num=$1
  local den=$2
  local digits=$3
  local scale=$((10 ** digits))
  local scaled

  if [[ $den -eq 0 ]]; then
    echo "n/a"
    return 0
  fi

  scaled=$((num * scale / den))
  printf '%d.%0*d\n' "$((scaled / scale))" "$digits" "$((scaled % scale))"
}

# Wait for all workers to complete
for pid in "${PIDS[@]}"; do
  wait "$pid" || echo "WARNING: worker process $pid exited with status $?" >&2
done

# End time
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))

echo ""
echo "============================================================"
echo "FINAL RESULTS"
echo "============================================================"

# Aggregate results
TOTAL_PASS=0
TOTAL_FAIL=0

for gpu in "${GPU_ARRAY[@]}"; do
  summary_file="$RESULT_DIR/gpu_${gpu}.summary"
  if [[ -f "$summary_file" ]]; then
    pass=0
    fail=0
    read -r pass fail < "$summary_file"
    TOTAL_PASS=$((TOTAL_PASS + ${pass:-0}))
    TOTAL_FAIL=$((TOTAL_FAIL + ${fail:-0}))
    echo "GPU $gpu: ${pass:-0} passed, ${fail:-0} failed"
  else
    # A worker writes its summary as its last step, so a missing file
    # means that worker never finished.
    echo "WARNING: GPU $gpu: no summary found (worker did not finish)" >&2
  fi
done

TOTAL_TESTED=$((TOTAL_PASS + TOTAL_FAIL))

# Samples that were requested but never produced a pass/fail count.
MISSING_SAMPLES=$((TOTAL_SAMPLES - TOTAL_TESTED))
if [[ $MISSING_SAMPLES -lt 0 ]]; then
  MISSING_SAMPLES=0
fi

if [[ $TOTAL_TESTED -gt 0 ]]; then
  ACCURACY=$(fixed_div "$((TOTAL_PASS * 100))" "$TOTAL_TESTED" 1)
else
  ACCURACY="0.0"
fi

echo ""
echo "------------------------------------------------------------"
echo "Total: $TOTAL_PASS/$TOTAL_TESTED passed ($ACCURACY%)"
echo "Duration: ${DURATION}s ($(fixed_div "$DURATION" 60 1) minutes)"
# DURATION is 0 when everything finishes within one second; fixed_div then
# prints "n/a" instead of dividing by zero.
echo "Throughput: $(fixed_div "$((TOTAL_TESTED * 60))" "$DURATION" 2) samples/min"
echo "------------------------------------------------------------"

# Save detailed results
{
  echo "RULER NIAH Parallel Test Results"
  echo "================================"
  echo "Date: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "GPUs: $GPUS"
  echo "Total samples: $TOTAL_TESTED"
  echo "Passed: $TOTAL_PASS"
  echo "Failed: $TOTAL_FAIL"
  echo "Accuracy: $ACCURACY%"
  echo "Duration: ${DURATION}s"
  if [[ $MISSING_SAMPLES -gt 0 ]]; then
    echo "Missing results: $MISSING_SAMPLES"
  fi
  echo ""
  echo "Per-sample results:"
  for ((i = 0; i < TOTAL_SAMPLES; i++)); do
    sample_file="$RESULT_DIR/sample_${i}.result"
    if [[ -f "$sample_file" ]]; then
      echo "Sample $i: $(< "$sample_file")"
    fi
  done
} > "$OUTPUT_LOG" || echo "WARNING: could not write $OUTPUT_LOG" >&2

echo ""
echo "Detailed results saved to: $OUTPUT_LOG"

# Cleanup
# rm -rf "$RESULT_DIR"

# Exit with appropriate code. A run only counts as passed when nothing
# failed AND every requested sample was actually counted.
if [[ $TOTAL_FAIL -eq 0 && $MISSING_SAMPLES -eq 0 ]]; then
  echo ""
  echo "test_ruler_niah.sh: ALL PASSED"
  exit 0
else
  echo ""
  if [[ $MISSING_SAMPLES -gt 0 ]]; then
    echo "test_ruler_niah.sh: $MISSING_SAMPLES of $TOTAL_SAMPLES samples produced no result" >&2
  fi
  echo "test_ruler_niah.sh: $TOTAL_FAIL FAILED"
  exit 1
fi