diff --git a/nanovllm/__init__.py b/nanovllm/__init__.py
index e84e8cc..551af23 100644
--- a/nanovllm/__init__.py
+++ b/nanovllm/__init__.py
@@ -1,2 +1,2 @@
 from nanovllm.llm import LLM
-from nanovllm.sampling_params import SamplingParams
\ No newline at end of file
+from nanovllm.sampling_params import SamplingParams
diff --git a/nanovllm/config.py b/nanovllm/config.py
index 5d9dcc0..6c4e7f9 100644
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -1,10 +1,11 @@
+import os
 from dataclasses import dataclass
 from transformers import AutoConfig
 
 
 @dataclass
 class Config:
-    model: str = ''
+    model: str
     max_num_batched_tokens: int = 32768
     max_num_seqs: int = 512
     max_model_len: int = 4096
@@ -17,5 +18,8 @@ class Config:
     num_kvcache_blocks: int = -1
 
     def __post_init__(self):
-        assert self.model
+        assert os.path.isdir(self.model)
         assert self.kvcache_block_size % 256 == 0
+        assert 1 <= self.tensor_parallel_size <= 8
+        self.hf_config = AutoConfig.from_pretrained(self.model)
+        self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
diff --git a/nanovllm/engine/llm_engine.py b/nanovllm/engine/llm_engine.py
index c29ef65..53a4887 100644
--- a/nanovllm/engine/llm_engine.py
+++ b/nanovllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
 import atexit
+from dataclasses import fields
 from time import perf_counter
 from tqdm.auto import tqdm
 from transformers import AutoConfig, AutoTokenizer
@@ -14,13 +15,9 @@ from nanovllm.engine.model_runner import ModelRunner
 class LLMEngine:
 
     def __init__(self, model, **kwargs):
-        config = Config(model)
-        for k, v in kwargs.items():
-            if hasattr(config, k):
-                setattr(config, k, v)
-        Sequence.block_size = config.kvcache_block_size
-        config.hf_config = AutoConfig.from_pretrained(config.model)
-        config.max_model_len = min(config.max_model_len, config.hf_config.max_position_embeddings)
+        config_fields = {field.name for field in fields(Config)}
+        config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
+        config = Config(model, **config_kwargs)
         self.ps = []
         self.events = []
         for i in range(1, config.tensor_parallel_size):
@@ -95,4 +92,4 @@ class LLMEngine:
         outputs = [{"text": self.tokenizer.decode(token_ids), "token_ids": token_ids} for token_ids in outputs]
         if use_tqdm:
             pbar.close()
-        return outputs
\ No newline at end of file
+        return outputs
diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py
index 33f429d..8973310 100644
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -57,9 +57,7 @@ class ModelRunner:
     def loop(self):
         while True:
            method_name, args = self.read_shm()
-            method = getattr(self, method_name, None)
-            assert callable(method)
-            method(*args)
+            self.call(method_name, *args)
            if method_name == "exit":
                break
 
@@ -82,8 +80,7 @@
            event.set()
 
     def call(self, method_name, *args):
-        assert self.rank == 0
-        if self.world_size > 1:
+        if self.world_size > 1 and self.rank == 0:
            self.write_shm(method_name, *args)
        method = getattr(self, method_name, None)
        assert callable(method)
diff --git a/nanovllm/layers/activation.py b/nanovllm/layers/activation.py
index 041ee20..8d026e1 100755
--- a/nanovllm/layers/activation.py
+++ b/nanovllm/layers/activation.py
@@ -11,4 +11,4 @@ class SiluAndMul(nn.Module):
     @torch.compile
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x, y = x.chunk(2, -1)
-        return F.silu(x) * y
+        return y.mul_(F.silu(x))
diff --git a/nanovllm/layers/embed_head.py b/nanovllm/layers/embed_head.py
index 1ab7043..6422337 100644
--- a/nanovllm/layers/embed_head.py
+++ b/nanovllm/layers/embed_head.py
@@ -69,4 +69,4 @@ class ParallelLMHead(VocabParallelEmbedding):
            all_logits = [torch.empty_like(logits) for _ in range(self.tp_size)] if self.tp_rank == 0 else None
            dist.gather(logits, all_logits, 0)
            logits = torch.cat(all_logits, -1) if self.tp_rank == 0 else None
-        return logits
\ No newline at end of file
+        return logits
diff --git a/nanovllm/layers/rotary_embedding.py b/nanovllm/layers/rotary_embedding.py
index 26ca5f9..1f483e7 100644
--- a/nanovllm/layers/rotary_embedding.py
+++ b/nanovllm/layers/rotary_embedding.py
@@ -70,4 +70,4 @@ def get_rope(
 ):
     assert rope_scaling is None
     rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base)
-    return rotary_emb
\ No newline at end of file
+    return rotary_emb
diff --git a/nanovllm/utils/context.py b/nanovllm/utils/context.py
index d4cfd3d..65bbd18 100644
--- a/nanovllm/utils/context.py
+++ b/nanovllm/utils/context.py
@@ -25,4 +25,4 @@ def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0
 
 def reset_context():
     global _CONTEXT
-    _CONTEXT = Context()
\ No newline at end of file
+    _CONTEXT = Context()
diff --git a/nanovllm/utils/loader.py b/nanovllm/utils/loader.py
index c052e0f..4ef8040 100644
--- a/nanovllm/utils/loader.py
+++ b/nanovllm/utils/loader.py
@@ -10,7 +10,6 @@ def default_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor):
 
 
 def load_model(model: nn.Module, path: str):
-    assert os.path.isdir(path)
     packed_modules_mapping = getattr(model, "packed_modules_mapping", {})
     for file in glob(os.path.join(path, "*.safetensors")):
         with safe_open(file, "pt", "cpu") as f:
diff --git a/pyproject.toml b/pyproject.toml
index efb424b..bf3ecda 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "nano-vllm"
-version = "0.1.0"
+version = "0.2.0"
 authors = [{ name = "Xingkai Yu" }]
 license = "MIT"
 license-files = ["LICENSE"]
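
Aside, a minimal runnable sketch (not part of the patch) of the kwargs-filtering idiom that the reworked LLMEngine.__init__ relies on: only keyword arguments whose names match Config's dataclass fields are forwarded to the constructor, and everything else is silently dropped. The Config below is a simplified stand-in; the real one now also requires model to be a local directory (assert os.path.isdir) and validates tensor_parallel_size in __post_init__.

from dataclasses import dataclass, fields

@dataclass
class Config:  # simplified stand-in for nanovllm.config.Config
    model: str
    max_num_seqs: int = 512

kwargs = {"max_num_seqs": 64, "not_a_config_field": True}
config_fields = {field.name for field in fields(Config)}  # {"model", "max_num_seqs"}
config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
config = Config("/path/to/model", **config_kwargs)
print(config)  # Config(model='/path/to/model', max_num_seqs=64)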