Merge branch 'zijie/fix-dist-3': Fix distributed port conflict
- Auto port allocation with _find_free_port() in model_runner.py
- Resource management refactor with close() + context manager in llm_engine.py
- Add tests/test_port_conflict.py and tests/run_parallel_niah.sh
- Remove docs/torch_distributed_port_issue.md (issue fixed)
- Ignore tests/data/ directory

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -34,14 +34,56 @@ class LLMEngine:
|
||||
# Set Sequence.block_size to match the KV cache block size
|
||||
Sequence.block_size = config.kvcache_block_size
|
||||
self.scheduler = Scheduler(config, self.model_runner.kvcache_manager)
|
||||
atexit.register(self.exit)
|
||||
self._closed = False
|
||||
atexit.register(self._atexit_handler)
|
||||
|
||||
def exit(self):
|
||||
def _atexit_handler(self):
    """Interpreter-exit hook: close the engine unless it is already closed."""
    # Nothing to do when close() has already run.
    if self._closed:
        return
    self.close()
|
||||
|
||||
def close(self):
    """Explicitly close the engine and release all resources.

    This method is idempotent - calling it multiple times is safe.
    Supports: explicit close(), context manager, and __del__ fallback.

    Steps, in order:
      1. Return immediately if the engine is already marked closed.
      2. Mark the engine closed.
      3. Unregister the atexit hook so it does not run again at shutdown.
      4. Send "exit" to the model runner and drop the reference to it.
      5. Join every process in ``self.ps``.
    """
    if self._closed:
        return
    # Set the flag before doing any work so that a nested call (e.g. from
    # __del__ or the atexit hook while cleanup is in progress) is a no-op.
    self._closed = True

    # atexit.unregister() silently does nothing when the callable is not
    # registered, so no exception guard is needed here.
    atexit.unregister(self._atexit_handler)

    # Cleanup resources
    self.model_runner.call("exit")
    del self.model_runner
    for p in self.ps:
        p.join()
|
||||
|
||||
def exit(self):
    """Backward-compatible name for close(); simply delegates to it."""
    self.close()
|
||||
|
||||
def __del__(self):
    """Best-effort cleanup when the object is finalized.

    Delegates to close() and discards any Exception it raises, so that
    finalization itself never propagates an error.
    """
    try:
        self.close()
    except Exception:
        # Intentionally ignored: this is only a fallback path.
        return
|
||||
|
||||
def __enter__(self):
    """Enter the ``with`` block; the engine itself is the bound value."""
    return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the ``with`` block: close the engine and suppress nothing."""
    self.close()
    # Returning False lets any exception raised inside the block propagate.
    return False
|
||||
|
||||
def add_request(self, prompt: str | list[int], sampling_params: SamplingParams):
|
||||
if isinstance(prompt, str):
|
||||
prompt = self.tokenizer.encode(prompt)
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import os
|
||||
import pickle
|
||||
import socket
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from multiprocessing.synchronize import Event
|
||||
@@ -16,6 +18,17 @@ from nanovllm.kvcache import create_kvcache_manager, KVCacheManager
|
||||
logger = get_logger("model_runner")
|
||||
|
||||
|
||||
def _find_free_port() -> int:
    """Return a TCP port number chosen by the OS for distributed communication.

    A temporary IPv4 stream socket is bound to port 0 on all interfaces, which
    asks the operating system to pick an available port. The assigned number
    is read back and the socket is closed when the ``with`` block exits, so
    the port is no longer held by this function once it returns.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.bind(('', 0))
        # NOTE(review): SO_REUSEADDR is set after bind(); confirm whether it
        # has any effect here or can be dropped.
        probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        assigned_port = probe.getsockname()[1]
        return assigned_port
|
||||
|
||||
|
||||
class ModelRunner:
|
||||
|
||||
def __init__(self, config: Config, rank: int, event: Event | list[Event]):
|
||||
@@ -27,8 +40,13 @@ class ModelRunner:
|
||||
self.rank = rank
|
||||
self.event = event
|
||||
|
||||
import os
|
||||
port = os.environ.get("NANOVLLM_DIST_PORT", "2333")
|
||||
# Dynamic port allocation: use env var if set, otherwise find a free port
|
||||
env_port = os.environ.get("NANOVLLM_DIST_PORT")
|
||||
if env_port is not None:
|
||||
port = int(env_port)
|
||||
else:
|
||||
port = _find_free_port()
|
||||
logger.info(f"Auto-assigned distributed port: {port}")
|
||||
dist.init_process_group("nccl", f"tcp://localhost:{port}", world_size=self.world_size, rank=rank)
|
||||
torch.cuda.set_device(rank)
|
||||
default_dtype = torch.get_default_dtype()
|
||||
|
||||
Reference in New Issue
Block a user