Merge branch 'zijie/fix-dist-3': Fix distributed port conflict

- Auto port allocation with _find_free_port() in model_runner.py
- Resource management refactor with close() + context manager in llm_engine.py
- Add tests/test_port_conflict.py and tests/run_parallel_niah.sh
- Remove docs/torch_distributed_port_issue.md (issue fixed)
- Ignore tests/data/ directory

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-12 16:20:44 +08:00
parent de6f36bdb2
commit 64971c8e8a
10 changed files with 784 additions and 792 deletions
--- a/tests/test_port_conflict.py
+++ b/tests/test_port_conflict.py
@@ -0,0 +1,198 @@
+"""Test for torch distributed port conflict fix.
+
+This test verifies that:
+1. Multiple independent processes can run simultaneously (dynamic port allocation)
+2. Sequential LLM creation in same process works (cleanup via close() or context manager)
+
+Usage:
+    # Test parallel processes (requires 2 GPUs)
+    python tests/test_port_conflict.py --model ~/models/Qwen3-4B --gpus 4,5 --test parallel
+
+    # Test sequential creation in same process
+    CUDA_VISIBLE_DEVICES=4 python tests/test_port_conflict.py --model ~/models/Qwen3-4B --test sequential
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+
+
+def test_sequential_creation(model_path: str, enable_offload: bool = True):
+    """Create, exercise and close three LLM instances in a row in this process.
+
+    Any failure surfaces as an exception; each created instance is still closed.
+
+    Args:
+        model_path: Model location passed to ``LLM``.
+        enable_offload: Forwarded as ``enable_cpu_offload``; if true, ``num_gpu_blocks=2`` is added.
+    """
+    # Put the repository root (two levels above this file) on sys.path for the import below.
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    sys.path.insert(0, project_root)
+    from nanovllm import LLM, SamplingParams
+
+    banner = "=" * 60
+    print(banner)
+    print("Test: Sequential LLM Creation (same process)")
+    print(banner)
+
+    llm_kwargs = {"enable_cpu_offload": enable_offload}
+    if enable_offload:
+        llm_kwargs["num_gpu_blocks"] = 2
+    for i in range(3):
+        print(f"\n--- Creating LLM instance {i+1}/3 ---")
+        llm = LLM(model_path, **llm_kwargs)
+        try:
+            outputs = llm.generate(["Hello, how are you?"], SamplingParams(max_tokens=20))
+            print(f"Output: {outputs[0]['text'][:50]}...")
+        finally:
+            llm.close()  # release the instance even when generation raises
+        print(f"Instance {i+1} closed successfully")
+
+    print("\n" + banner)
+    print("PASSED: test_sequential_creation")
+    print(banner)
+
+
+def test_context_manager(model_path: str, enable_offload: bool = True):
+    """Exercise ``LLM`` as a context manager, two instances back to back.
+
+    Args:
+        model_path: Model location passed to ``LLM``.
+        enable_offload: Forwarded as ``enable_cpu_offload``; if true, ``num_gpu_blocks=2`` is added.
+    """
+    # The repository root sits two directory levels above this file.
+    here = os.path.abspath(__file__)
+    sys.path.insert(0, os.path.dirname(os.path.dirname(here)))
+    from nanovllm import LLM, SamplingParams
+
+    rule = "=" * 60
+    print(rule)
+    print("Test: Context Manager")
+    print(rule)
+
+    for idx in range(1, 3):
+        print(f"\n--- Context manager instance {idx}/2 ---")
+        options = {"enable_cpu_offload": enable_offload}
+        if enable_offload:
+            options["num_gpu_blocks"] = 2
+        with LLM(model_path, **options) as llm:
+            outputs = llm.generate(["What is 2+2?"], SamplingParams(max_tokens=20))
+            print(f"Output: {outputs[0]['text'][:50]}...")
+        print(f"Instance {idx} auto-closed via context manager")
+
+    print("\n" + rule)
+    print("PASSED: test_context_manager")
+    print(rule)
+
+
+def test_parallel_processes(model_path: str, gpus: str, enable_offload: bool = True):
+    """Run two independent nanovllm processes at the same time, one per GPU.
+
+    Args:
+        model_path: Model location embedded into the child script.
+        gpus: Comma-separated GPU ids; only the first two are used.
+        enable_offload: Forwarded as ``enable_cpu_offload``; if true, ``num_gpu_blocks=2`` is added.
+
+    Returns:
+        True if both children exit with code 0. False otherwise, including when
+        fewer than two GPUs are given or a child is killed after the 300 s timeout.
+    """
+    gpu_list = [int(g) for g in gpus.split(",") if g.strip()]
+    if len(gpu_list) < 2:
+        print("ERROR: Need at least 2 GPUs for parallel test")
+        return False
+
+    banner = "=" * 60
+    print(banner)
+    print(f"Test: Parallel Processes (GPUs: {gpu_list})")
+    print(banner)
+
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    # Child program; paths use !r so quotes/backslashes in them cannot break the source.
+    script = f'''
+import sys
+sys.path.insert(0, {project_root!r})
+import os
+from nanovllm import LLM, SamplingParams
+
+gpu = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
+print(f"[GPU {{gpu}}] Starting LLM...")
+
+llm_kwargs = {{"enable_cpu_offload": {enable_offload}}}
+if {enable_offload}:
+    llm_kwargs["num_gpu_blocks"] = 2
+
+llm = LLM({model_path!r}, **llm_kwargs)
+print(f"[GPU {{gpu}}] LLM initialized, generating...")
+
+outputs = llm.generate(["Hello world"], SamplingParams(max_tokens=10))
+print(f"[GPU {{gpu}}] Output: {{outputs[0]['text'][:30]}}...")
+
+llm.close()
+print(f"[GPU {{gpu}}] Done")
+'''
+
+    procs = []
+    for gpu in gpu_list[:2]:
+        print(f"\nStarting process on GPU {gpu}...")
+        env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpu))
+        p = subprocess.Popen([sys.executable, "-c", script], env=env, text=True,
+                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        procs.append((gpu, p))
+        time.sleep(2)  # Stagger starts to see concurrent running
+
+    all_passed = True
+    for gpu, p in procs:
+        try:
+            stdout, _ = p.communicate(timeout=300)
+        except subprocess.TimeoutExpired:
+            p.kill()  # a hung child must not outlive the test; it is reported as a failure
+            stdout, _ = p.communicate()
+        print(f"\n--- GPU {gpu} output ---")
+        print(stdout)
+        if p.returncode != 0:
+            print(f"ERROR: GPU {gpu} process failed with code {p.returncode}")
+            all_passed = False
+        else:
+            print(f"GPU {gpu} process completed successfully")
+
+    print("\n" + banner)
+    print(("PASSED" if all_passed else "FAILED") + ": test_parallel_processes")
+    print(banner)
+    return all_passed
+
+
+def main():
+    """Parse CLI options, run the selected tests and return the exit status.
+
+    Returns 0 on success and 1 when the parallel test reports failure by returning False.
+    """
+    parser = argparse.ArgumentParser(description="Test port conflict fix")
+    parser.add_argument("--model", "-m", required=True, help="Path to model")
+    parser.add_argument("--gpus", default="0,1", help="GPUs to use for parallel test (comma-separated)")
+    parser.add_argument("--test", choices=["sequential", "context", "parallel", "all"],
+                        default="all", help="Which test to run")
+    parser.add_argument("--no-offload", action="store_true", help="Disable CPU offload")
+    args = parser.parse_args()
+
+    enable_offload = not args.no_offload
+    model_path = os.path.expanduser(args.model)
+    print(f"Model: {model_path}")
+    print(f"CPU Offload: {enable_offload}")
+    print(f"GPUs for parallel test: {args.gpus}\n")
+    if args.test in ("sequential", "all"):
+        test_sequential_creation(model_path, enable_offload)
+        print()
+    if args.test in ("context", "all"):
+        test_context_manager(model_path, enable_offload)
+        print()
+    if args.test in ("parallel", "all") and not test_parallel_processes(model_path, args.gpus, enable_offload):
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())