Merge branch 'zijie/add-llama-1': Add multi-model support

- Add model registry system for dynamic model loading
- Implement LlamaForCausalLM with Llama3 RoPE scaling
- Register Qwen3ForCausalLM and Qwen2ForCausalLM
- Update ModelRunner to use get_model_class() for dynamic model selection

Tested: needle 32k test PASSED

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 21:20:53 +08:00
parent 9377ff63fe 24f5ae5fc3
commit e23be2e844
10 changed files with 947 additions and 7 deletions
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -6,7 +6,7 @@ from multiprocessing.shared_memory import SharedMemory

 from nanovllm.config import Config, SparsePolicyType
 from nanovllm.engine.sequence import Sequence
-from nanovllm.models.qwen3 import Qwen3ForCausalLM
+from nanovllm.models import get_model_class
 from nanovllm.layers.sampler import GreedySampler
 from nanovllm.utils.context import set_context, get_context, reset_context
 from nanovllm.utils.loader import load_model
@@ -32,7 +32,8 @@ class ModelRunner:
        default_dtype = torch.get_default_dtype()
        torch.set_default_dtype(hf_config.torch_dtype)
        torch.set_default_device("cuda")
-        self.model = Qwen3ForCausalLM(hf_config)
+        model_class = get_model_class(hf_config)
+        self.model = model_class(hf_config)
        load_model(self.model, config.model)
        self.sampler = GreedySampler()