Merge branch 'zijie/fix-dist-3': Fix distributed port conflict
- Auto port allocation with _find_free_port() in model_runner.py
- Resource management refactor with close() + context manager in llm_engine.py
- Add tests/test_port_conflict.py and tests/run_parallel_niah.sh
- Remove docs/torch_distributed_port_issue.md (issue fixed)
- Ignore tests/data/ directory

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
112
tests/run_parallel_niah.sh
Executable file
112
tests/run_parallel_niah.sh
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/bin/bash
# Run NIAH tests in parallel, one background process per GPU.
# This tests the dynamic port allocation fix.
#
# Usage: run_parallel_niah.sh [MODEL_PATH]
#   MODEL_PATH  Model directory passed to test_ruler_niah.py via --model.
#               Defaults to /home/zijie/models/Llama-3.1-8B-Instruct.
#
# Output: one log file per GPU under <project root>/logs, plus a summary on
#         stdout.
# Exit status: 0 if every launched process exits successfully, 1 otherwise.

set -e

MODEL="${1:-/home/zijie/models/Llama-3.1-8B-Instruct}"
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"

# Sample distribution (100 samples total):
# GPU 0: 0-16  (17 samples)
# GPU 1: 17-33 (17 samples)
# GPU 2: 34-50 (17 samples)
# GPU 3: 51-67 (17 samples)
# GPU 4: 68-83 (16 samples)
# GPU 5: 84-99 (16 samples)
#
# The array index doubles as the GPU id, so adding or removing an entry here
# is the only change needed to run on a different number of GPUs.
declare -a RANGES=("0-16" "17-33" "34-50" "51-67" "68-83" "84-99")
NUM_GPUS=${#RANGES[@]}

declare -a PIDS=()
declare -a LOG_FILES=()

echo "=========================================="
echo "Parallel NIAH Test on $NUM_GPUS GPUs"
echo "=========================================="
echo "Model: $MODEL"
echo "Project: $PROJECT_ROOT"
echo ""

# Create log directory
LOG_DIR="$PROJECT_ROOT/logs"
mkdir -p "$LOG_DIR"

# Start one process per entry in RANGES
for gpu in "${!RANGES[@]}"; do
  range="${RANGES[$gpu]}"
  log_file="$LOG_DIR/gpu${gpu}_${range}.log"
  # Remember the path so the later loops do not have to rebuild it.
  LOG_FILES+=("$log_file")

  echo "Starting GPU $gpu: samples $range -> $log_file"

  # ${PYTHONPATH:+:...} appends the caller's PYTHONPATH only when it is
  # non-empty; a bare trailing ':' would add an empty entry to the Python
  # module search path.
  CUDA_VISIBLE_DEVICES="$gpu" \
  PYTHONPATH="$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
    python "$PROJECT_ROOT/tests/test_ruler_niah.py" \
      --model "$MODEL" \
      --sample-indices "$range" \
      --enable-offload \
      --num-gpu-blocks 4 \
      --quiet \
      > "$log_file" 2>&1 &

  PIDS+=("$!")

  # Small delay to stagger starts
  sleep 2
done

echo ""
echo "All $NUM_GPUS processes started. Waiting for completion..."
echo "PIDs: ${PIDS[*]}"
echo ""

# Wait for all processes and collect results
declare -a RESULTS=()
ALL_PASSED=true

for i in "${!PIDS[@]}"; do
  range="${RANGES[$i]}"

  # 'wait' is used as an 'if' condition so a failing child does not trip
  # 'set -e'; its exit status is still available in the else branch.
  if wait "${PIDS[$i]}"; then
    RESULTS+=("GPU $i ($range): PASSED")
    echo "GPU $i completed successfully"
  else
    status=$?
    RESULTS+=("GPU $i ($range): FAILED (exit code $status)")
    ALL_PASSED=false
    echo "GPU $i FAILED!"
  fi
done

echo ""
echo "=========================================="
echo "RESULTS SUMMARY"
echo "=========================================="
for result in "${RESULTS[@]}"; do
  echo "$result"
done
echo ""

# Show accuracy from each log
echo "Accuracy per GPU:"
for i in "${!LOG_FILES[@]}"; do
  range="${RANGES[$i]}"
  log_file="${LOG_FILES[$i]}"
  if [[ -f "$log_file" ]]; then
    # The pipeline's exit status is that of tail/head, which succeed even
    # when grep matches nothing, so a '|| echo N/A' fallback would never
    # run. Default the empty result at expansion time instead.
    accuracy=$(grep -E "Accuracy:|accuracy" "$log_file" | tail -n 1)
    port=$(grep "Auto-assigned distributed port" "$log_file" | head -n 1)
    echo " GPU $i ($range): ${accuracy:-N/A} | ${port:-N/A}"
  fi
done

echo ""
if [[ "$ALL_PASSED" == true ]]; then
  echo "=========================================="
  echo "ALL $NUM_GPUS TESTS PASSED!"
  echo "Dynamic port allocation works correctly."
  echo "=========================================="
  exit 0
else
  echo "=========================================="
  echo "SOME TESTS FAILED!"
  echo "Check logs in $LOG_DIR"
  echo "=========================================="
  exit 1
fi
|
||||
Reference in New Issue
Block a user