Files
nano-vllm/tests/run_parallel_niah.sh
Zijie Tian 64971c8e8a Merge branch 'zijie/fix-dist-3': Fix distributed port conflict
- Auto port allocation with _find_free_port() in model_runner.py
- Resource management refactor with close() + context manager in llm_engine.py
- Add tests/test_port_conflict.py and tests/run_parallel_niah.sh
- Remove docs/torch_distributed_port_issue.md (issue fixed)
- Ignore tests/data/ directory

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-12 16:27:25 +08:00

113 lines
2.8 KiB
Bash
Executable File

#!/bin/bash
# Fan the NIAH test out over 6 GPUs, one background worker per GPU.
# Running the workers side by side exercises the dynamic port allocation fix.
#
# Usage: run_parallel_niah.sh [MODEL_PATH]
#   MODEL_PATH  model directory handed to every worker (optional; see default)
set -e

# Repository root = parent of the directory that holds this script.
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
MODEL="${1:-/home/zijie/models/Llama-3.1-8B-Instruct}"

printf '%s\n' \
  "==========================================" \
  "Parallel NIAH Test on 6 GPUs" \
  "==========================================" \
  "Model: $MODEL" \
  "Project: $PROJECT_ROOT" \
  ""

# 100 samples split across the GPUs; array index == GPU id.
RANGES=(
  "0-16"    # GPU 0: 17 samples
  "17-33"   # GPU 1: 17 samples
  "34-50"   # GPU 2: 17 samples
  "51-67"   # GPU 3: 17 samples
  "68-83"   # GPU 4: 16 samples
  "84-99"   # GPU 5: 16 samples
)
# Filled with the worker PIDs as they are launched.
PIDS=()

# Each worker writes its own log file under this directory.
LOG_DIR="${PROJECT_ROOT}/logs"
mkdir -p "${LOG_DIR}"
# Start one background worker per entry in RANGES (array index == GPU id).

# Kill every worker started so far.
# Background jobs of a non-interactive shell ignore SIGINT, so without this
# an interrupted run would leave the workers alive and still holding the GPUs.
# Globals: PIDS (read)
terminate_workers() {
  if (( ${#PIDS[@]} > 0 )); then
    kill "${PIDS[@]}" 2>/dev/null || true
  fi
}
trap 'terminate_workers; exit 130' INT
trap 'terminate_workers; exit 143' TERM

# Launch a single NIAH worker in the background and remember its PID.
# Globals:   PROJECT_ROOT, MODEL, LOG_DIR, PYTHONPATH (read); PIDS (appended)
# Arguments: $1 - GPU id exported as CUDA_VISIBLE_DEVICES
#            $2 - sample range passed to --sample-indices (e.g. "0-16")
# Outputs:   one "Starting GPU ..." line on stdout; worker stdout+stderr go
#            to $LOG_DIR/gpu<id>_<range>.log
launch_worker() {
  local gpu=$1
  local range=$2
  local log_file="$LOG_DIR/gpu${gpu}_${range}.log"

  echo "Starting GPU $gpu: samples $range -> $log_file"

  # ${PYTHONPATH:+:...} appends the old value only when there is one, so an
  # unset/empty PYTHONPATH does not leave a dangling ':' (empty path entry).
  CUDA_VISIBLE_DEVICES=$gpu PYTHONPATH="$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
    python "$PROJECT_ROOT/tests/test_ruler_niah.py" \
    --model "$MODEL" \
    --sample-indices "$range" \
    --enable-offload \
    --num-gpu-blocks 4 \
    --quiet \
    > "$log_file" 2>&1 &
  PIDS+=("$!")
}

for gpu in "${!RANGES[@]}"; do
  launch_worker "$gpu" "${RANGES[$gpu]}"
  # Small delay to stagger starts
  sleep 2
done

echo ""
echo "All ${#PIDS[@]} processes started. Waiting for completion..."
echo "PIDs: ${PIDS[*]}"
echo ""
# Wait for all processes and collect results
declare -a RESULTS=()   # one human-readable verdict line per GPU
ALL_PASSED=true         # flipped to false as soon as any worker fails

# Wait for one worker and record whether it passed.
# Globals:   RESULTS (appended), ALL_PASSED (set to false on failure)
# Arguments: $1 - GPU id, used only in the messages
#            $2 - PID of the background worker to wait for
#            $3 - sample range handled by that worker, used only in messages
# Outputs:   one status line on stdout
record_result() {
  local gpu=$1
  local pid=$2
  local range=$3
  local status=0

  # '|| status=$?' captures the worker's exit code without tripping 'set -e'.
  wait "$pid" || status=$?

  if (( status == 0 )); then
    RESULTS+=("GPU $gpu ($range): PASSED")
    echo "GPU $gpu completed successfully"
  else
    RESULTS+=("GPU $gpu ($range): FAILED (exit code $status)")
    ALL_PASSED=false
    echo "GPU $gpu FAILED!"
  fi
}

# Iterate over RANGES rather than a literal 0..5 so the loop follows the
# configured ranges; a range with no recorded PID is reported as FAILED
# because 'wait' rejects the empty PID.
for i in "${!RANGES[@]}"; do
  record_result "$i" "${PIDS[$i]:-}" "${RANGES[$i]}"
done
# Print the summary banner followed by one line per entry in RESULTS.
printf '\n'
printf '%s\n' \
  "==========================================" \
  "RESULTS SUMMARY" \
  "=========================================="
# Guarded so that an empty RESULTS prints nothing at all.
if (( ${#RESULTS[@]} > 0 )); then
  printf '%s\n' "${RESULTS[@]}"
fi
printf '\n'
# Show accuracy from each log

# Print one matching line from a log file, or "N/A" when nothing matches.
# The fallback is applied to the captured text instead of using
# '|| echo "N/A"' after the pipeline: a pipeline's exit status is that of its
# last command (tail/head), which succeeds even when grep found nothing.
# Arguments: $1 - "first" keeps the first matching line, anything else the last
#            $2 - extended regular expression handed to grep -E
#            $3 - log file to search
# Outputs:   the selected line, or "N/A", on stdout
extract_log_line() {
  local which=$1
  local pattern=$2
  local file=$3
  local line=""

  # '|| true' keeps a no-match from aborting the script should pipefail or
  # 'set -e' be in effect.
  if [[ "$which" == "first" ]]; then
    line=$(grep -E -- "$pattern" "$file" | head -n 1) || true
  else
    line=$(grep -E -- "$pattern" "$file" | tail -n 1) || true
  fi

  printf '%s\n' "${line:-N/A}"
}

echo "Accuracy per GPU:"
for i in "${!RANGES[@]}"; do
  range="${RANGES[$i]}"
  log_file="$LOG_DIR/gpu${i}_${range}.log"
  # A missing log is skipped without printing anything for that GPU.
  if [[ -f "$log_file" ]]; then
    accuracy=$(extract_log_line last "Accuracy:|accuracy" "$log_file")
    port=$(extract_log_line first "Auto-assigned distributed port" "$log_file")
    echo " GPU $i ($range): $accuracy | $port"
  fi
done
echo ""
# Final verdict: print the closing banner and exit 0 on overall success,
# 1 otherwise.
printf '\n'

# ALL_PASSED holds the word "true" or "false"; expanding it unquoted runs that
# command, so its exit status drives the branch.
if ! $ALL_PASSED; then
  printf '%s\n' \
    "==========================================" \
    "SOME TESTS FAILED!" \
    "Check logs in $LOG_DIR" \
    "=========================================="
  exit 1
fi

printf '%s\n' \
  "==========================================" \
  "ALL 6 TESTS PASSED!" \
  "Dynamic port allocation works correctly." \
  "=========================================="
exit 0