From e09a2a5b10fc8067dd6e1b42b2a0000c98c46f17 Mon Sep 17 00:00:00 2001
From: Zijie Tian
Date: Wed, 28 Jan 2026 13:44:32 +0800
Subject: [PATCH] ✨ feat: add Qwen2/2.5 model support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Separate the Qwen2 implementation from Qwen3:

- Qwen2: uses QKV bias, no QK norm
- Qwen3: optional QK norm, applied only when there is no QKV bias

Tested with Qwen2.5-7B-Instruct-1M; RULER niah_single_1 passed.

Co-Authored-By: Claude Opus 4.5
---
 nanovllm/models/__init__.py |   1 +
 nanovllm/models/qwen2.py    | 207 ++++++++++++++++++++++++++++++++++++
 nanovllm/models/qwen3.py    |   2 +-
 3 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 nanovllm/models/qwen2.py

diff --git a/nanovllm/models/__init__.py b/nanovllm/models/__init__.py
index a865c1a..f8b6a60 100644
--- a/nanovllm/models/__init__.py
+++ b/nanovllm/models/__init__.py
@@ -3,6 +3,7 @@ from nanovllm.models.registry import register_model, get_model_class, MODEL_REGISTRY
 # Import models to trigger registration
+from nanovllm.models import qwen2
 from nanovllm.models import qwen3
 from nanovllm.models import llama
 from nanovllm.models import glm4
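
An editorial note on the hunk above: the new import exists only to execute qwen2.py at package-import time so that its @register_model decorator runs. Below is a minimal sketch of the resulting dispatch, assuming MODEL_REGISTRY is a mapping from architecture name to model class and get_model_class(name) looks a name up in it (both are re-exported by this __init__.py; their exact signatures are an assumption here, not something the patch shows):

    # Sketch only, not part of the patch.
    from nanovllm.models import MODEL_REGISTRY, get_model_class

    # After this patch, "Qwen2ForCausalLM" resolves to the new qwen2.py implementation
    # instead of the Qwen3 alias removed in the qwen3.py hunk further below.
    assert "Qwen2ForCausalLM" in MODEL_REGISTRY
    assert "Qwen3ForCausalLM" in MODEL_REGISTRY

    print(get_model_class("Qwen2ForCausalLM").__module__)  # expected: nanovllm.models.qwen2
    print(get_model_class("Qwen3ForCausalLM").__module__)  # expected: nanovllm.models.qwen3
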
diff --git a/nanovllm/models/qwen2.py b/nanovllm/models/qwen2.py
new file mode 100644
index 0000000..295babb
--- /dev/null
+++ b/nanovllm/models/qwen2.py
@@ -0,0 +1,207 @@
+import torch
+from torch import nn
+import torch.distributed as dist
+from transformers import Qwen2Config
+
+from nanovllm.layers.activation import SiluAndMul
+from nanovllm.layers.attention import Attention
+from nanovllm.layers.layernorm import RMSNorm
+from nanovllm.layers.linear import QKVParallelLinear, MergedColumnParallelLinear, RowParallelLinear
+from nanovllm.layers.rotary_embedding import get_rope
+from nanovllm.layers.embed_head import VocabParallelEmbedding, ParallelLMHead
+from nanovllm.models.registry import register_model
+
+
+class Qwen2Attention(nn.Module):
+    """Qwen2/2.5 Attention without QK norm (unlike Qwen3)."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position: int = 4096 * 32,
+        head_dim: int | None = None,
+        rope_theta: float = 10000,
+        rope_scaling: tuple | None = None,
+    ) -> None:
+        super().__init__()
+        tp_size = dist.get_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        assert self.total_num_kv_heads % tp_size == 0
+        self.num_kv_heads = self.total_num_kv_heads // tp_size
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim ** -0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=True,  # Qwen2/2.5 always uses bias
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            self.num_kv_heads,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q = q.view(-1, self.num_heads, self.head_dim)
+        k = k.view(-1, self.num_kv_heads, self.head_dim)
+        v = v.view(-1, self.num_kv_heads, self.head_dim)
+        q, k = self.rotary_emb(positions, q, k)
+        o = self.attn(q, k, v)
+        output = self.o_proj(o.flatten(1, -1))
+        return output
+
+
+class Qwen2MLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+        )
+        assert hidden_act == "silu"
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x = self.down_proj(x)
+        return x
+
+
+class Qwen2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+    ) -> None:
+        super().__init__()
+        self.self_attn = Qwen2Attention(
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            max_position=config.max_position_embeddings,
+            head_dim=getattr(config, 'head_dim', None),
+            rope_theta=getattr(config, "rope_theta", 1000000),
+            rope_scaling=getattr(config, "rope_scaling", None),
+        )
+        self.mlp = Qwen2MLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            hidden_states, residual = self.input_layernorm(hidden_states), hidden_states
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(positions, hidden_states)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class Qwen2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+    ) -> None:
+        super().__init__()
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList([Qwen2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(positions, hidden_states, residual)
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+@register_model("Qwen2ForCausalLM")
+class Qwen2ForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "q_proj": ("qkv_proj", "q"),
+        "k_proj": ("qkv_proj", "k"),
+        "v_proj": ("qkv_proj", "v"),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen2Config
+    ) -> None:
+        super().__init__()
+        self.model = Qwen2Model(config)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        if config.tie_word_embeddings:
+            self.lm_head.weight.data = self.model.embed_tokens.weight.data
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.model(input_ids, positions)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.lm_head(hidden_states)
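
An editorial note on the commit message's Qwen2-vs-Qwen3 split, which the file above implements on the Qwen2 side: the two families differ in where the query/key path gets its extra parameters. Below is a rough sketch of the per-head q treatment, assuming qwen3.py applies a per-head RMSNorm to q and k before RoPE (its q_norm/k_norm layers); shapes follow the (num_tokens, num_heads, head_dim) layout used in Qwen2Attention.forward above:

    # Sketch only, not part of the patch. Requires PyTorch 2.4+ for nn.RMSNorm.
    import torch
    from torch import nn

    head_dim = 128
    q = torch.randn(4, 8, head_dim)  # (num_tokens, num_heads, head_dim) after the qkv split

    # Qwen2/2.5: the extra capacity is the QKV projection bias (bias=True above);
    # q goes straight to RoPE with no per-head normalization.
    q_qwen2 = q

    # Qwen3: no QKV bias; instead each head's q (and k) is RMS-normalized before RoPE.
    q_norm = nn.RMSNorm(head_dim, eps=1e-6)
    q_qwen3 = q_norm(q)
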
diff --git a/nanovllm/models/qwen3.py b/nanovllm/models/qwen3.py
index b4e8413..64f8d48 100755
--- a/nanovllm/models/qwen3.py
+++ b/nanovllm/models/qwen3.py
@@ -187,7 +187,7 @@ class Qwen3Model(nn.Module):
         return hidden_states
 
 
-@register_model("Qwen3ForCausalLM", "Qwen2ForCausalLM")
+@register_model("Qwen3ForCausalLM")
 class Qwen3ForCausalLM(nn.Module):
     packed_modules_mapping = {
         "q_proj": ("qkv_proj", "q"),