"""
Custom Qwen3 implementation using only torch and transformers.
This file provides a clean reference implementation for understanding the model computation graph.

Computation Graph:
==================

Input: token_ids [batch, seq_len]
         │
         ▼
    ┌─────────────┐
    │  Embedding  │  embed_tokens: [vocab_size, hidden_size]
    └─────────────┘
         │
         ▼
    hidden_states [batch, seq_len, hidden_size]
         │
         ▼
    ┌─────────────────────────────────────────────────────────┐
    │                   Decoder Layer (x N)                   │
    │  ┌───────────────────────────────────────────────────┐  │
    │  │              Self Attention Block                 │  │
    │  │                                                   │  │
    │  │  input_layernorm (RMSNorm)                        │  │
    │  │         │                                         │  │
    │  │         ▼                                         │  │
    │  │  ┌─────────────────────────────────────────────┐  │  │
    │  │  │            Qwen3Attention                   │  │  │
    │  │  │  Q = q_proj(x) → q_norm → reshape           │  │  │
    │  │  │  K = k_proj(x) → k_norm → reshape           │  │  │
    │  │  │  V = v_proj(x) → reshape                    │  │  │
    │  │  │         │                                   │  │  │
    │  │  │         ▼                                   │  │  │
    │  │  │  Q, K = apply_rotary_pos_emb(Q, K, cos, sin)│  │  │
    │  │  │         │                                   │  │  │
    │  │  │         ▼                                   │  │  │
    │  │  │  attn_output = attention(Q, K, V)           │  │  │
    │  │  │         │                                   │  │  │
    │  │  │         ▼                                   │  │  │
    │  │  │  output = o_proj(attn_output)               │  │  │
    │  │  └─────────────────────────────────────────────┘  │  │
    │  │         │                                         │  │
    │  │         ▼                                         │  │
    │  │  hidden_states = residual + attn_output           │  │
    │  └───────────────────────────────────────────────────┘  │
    │                        │                                │
    │                        ▼                                │
    │  ┌───────────────────────────────────────────────────┐  │
    │  │                  MLP Block                        │  │
    │  │                                                   │  │
    │  │  post_attention_layernorm (RMSNorm)               │  │
    │  │         │                                         │  │
    │  │         ▼                                         │  │
    │  │  ┌─────────────────────────────────────────────┐  │  │
    │  │  │              Qwen3MLP                       │  │  │
    │  │  │  gate = gate_proj(x)                        │  │  │
    │  │  │  up = up_proj(x)                            │  │  │
    │  │  │  output = down_proj(silu(gate) * up)        │  │  │
    │  │  └─────────────────────────────────────────────┘  │  │
    │  │         │                                         │  │
    │  │         ▼                                         │  │
    │  │  hidden_states = residual + mlp_output            │  │
    │  └───────────────────────────────────────────────────┘  │
    └─────────────────────────────────────────────────────────┘
         │
         ▼
    ┌─────────────┐
    │    norm     │  final RMSNorm
    └─────────────┘
         │
         ▼
    ┌─────────────┐
    │   lm_head   │  [hidden_size, vocab_size]
    └─────────────┘
         │
         ▼
    logits [batch, seq_len, vocab_size]
"""

import math
from typing import Optional, Tuple, List
import torch
import torch.nn as nn
import torch.nn.functional as F


class Qwen3RMSNorm(nn.Module):
    """RMSNorm implementation."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        input_dtype = x.dtype
        x = x.float()
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.eps)
        return self.weight * x.to(input_dtype)


class Qwen3RotaryEmbedding(nn.Module):
    """Rotary Position Embedding (RoPE)."""

    def __init__(self, dim: int, max_position_embeddings: int = 32768, base: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        # Compute inverse frequencies
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: Input tensor [batch, seq_len, num_heads, head_dim] or similar
            position_ids: Position indices [batch, seq_len]

        Returns:
            cos, sin: [batch, seq_len, head_dim]
        """
        # inv_freq: [dim/2]
        # position_ids: [batch, seq_len]
        inv_freq_expanded = self.inv_freq[None, :, None].float()  # [1, dim/2, 1]
        position_ids_expanded = position_ids[:, None, :].float()  # [batch, 1, seq_len]

        # freqs: [batch, dim/2, seq_len]
        freqs = inv_freq_expanded @ position_ids_expanded
        # freqs: [batch, seq_len, dim/2]
        freqs = freqs.transpose(1, 2)

        # Duplicate for full head_dim: [batch, seq_len, dim]
        emb = torch.cat((freqs, freqs), dim=-1)

        cos = emb.cos().to(x.dtype)
        sin = emb.sin().to(x.dtype)

        return cos, sin


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Rotate half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary position embeddings to Q and K.

    Args:
        q: [batch, num_heads, seq_len, head_dim]
        k: [batch, num_kv_heads, seq_len, head_dim]
        cos: [batch, seq_len, head_dim]
        sin: [batch, seq_len, head_dim]

    Returns:
        q_embed, k_embed with same shapes as inputs
    """
    # Unsqueeze for broadcasting: [batch, 1, seq_len, head_dim]
    cos = cos.unsqueeze(1)
    sin = sin.unsqueeze(1)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)

    return q_embed, k_embed


class Qwen3Attention(nn.Module):
    """
    Qwen3 Multi-Head Attention with Grouped Query Attention (GQA) support.

    Data Flow:
    ---------
    hidden_states [batch, seq_len, hidden_size]
            │
            ├──► q_proj ──► q_norm ──► reshape ──► Q [batch, num_heads, seq_len, head_dim]
            ├──► k_proj ──► k_norm ──► reshape ──► K [batch, num_kv_heads, seq_len, head_dim]
            └──► v_proj ──► reshape ──► V [batch, num_kv_heads, seq_len, head_dim]
                    │
                    ▼
            apply_rotary_pos_emb(Q, K)
                    │
                    ▼
            attention(Q, K, V) ──► attn_output [batch, num_heads, seq_len, head_dim]
                    │
                    ▼
            reshape ──► o_proj ──► output [batch, seq_len, hidden_size]
    """

    def __init__(
        self,
        hidden_size: int,
        num_attention_heads: int,
        num_key_value_heads: int,
        head_dim: int,
        max_position_embeddings: int = 32768,
        rope_theta: float = 10000.0,
        attention_bias: bool = False,
        rms_norm_eps: float = 1e-6,
        layer_idx: int = 0,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_attention_heads
        self.num_kv_heads = num_key_value_heads
        self.head_dim = head_dim
        self.num_kv_groups = num_attention_heads // num_key_value_heads
        self.layer_idx = layer_idx

        # Scaling factor
        self.scaling = head_dim ** -0.5

        # QKV projections
        self.q_proj = nn.Linear(hidden_size, num_attention_heads * head_dim, bias=attention_bias)
        self.k_proj = nn.Linear(hidden_size, num_key_value_heads * head_dim, bias=attention_bias)
        self.v_proj = nn.Linear(hidden_size, num_key_value_heads * head_dim, bias=attention_bias)
        self.o_proj = nn.Linear(num_attention_heads * head_dim, hidden_size, bias=attention_bias)

        # QK normalization (Qwen3 specific)
        self.q_norm = Qwen3RMSNorm(head_dim, eps=rms_norm_eps)
        self.k_norm = Qwen3RMSNorm(head_dim, eps=rms_norm_eps)

        # Rotary embeddings
        self.rotary_emb = Qwen3RotaryEmbedding(
            head_dim,
            max_position_embeddings=max_position_embeddings,
            base=rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
        output_qkv: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]], Optional[dict]]:
        """
        Args:
            hidden_states: [batch, seq_len, hidden_size]
            position_ids: [batch, seq_len]
            attention_mask: [batch, 1, seq_len, kv_seq_len] (causal mask)
            past_key_value: (k_cache, v_cache) from previous steps
            use_cache: Whether to return updated cache
            output_qkv: Whether to output Q, K, V tensors for debugging

        Returns:
            output: [batch, seq_len, hidden_size]
            past_key_value: Updated cache (if use_cache=True)
            qkv_dict: {"q": Q, "k": K, "v": V} (if output_qkv=True)
        """
        batch_size, seq_len, _ = hidden_states.shape

        # === QKV Projections ===
        q = self.q_proj(hidden_states)  # [batch, seq_len, num_heads * head_dim]
        k = self.k_proj(hidden_states)  # [batch, seq_len, num_kv_heads * head_dim]
        v = self.v_proj(hidden_states)  # [batch, seq_len, num_kv_heads * head_dim]

        # Reshape to [batch, seq_len, num_heads, head_dim]
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim)
        k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)
        v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)

        # === QK Normalization (Qwen3 specific) ===
        q = self.q_norm(q)
        k = self.k_norm(k)

        # Transpose to [batch, num_heads, seq_len, head_dim]
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # === Rotary Position Embeddings ===
        cos, sin = self.rotary_emb(v, position_ids)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        # === KV Cache Update ===
        if past_key_value is not None:
            k_cache, v_cache = past_key_value
            k = torch.cat([k_cache, k], dim=2)
            v = torch.cat([v_cache, v], dim=2)

        new_past_key_value = (k, v) if use_cache else None

        # === Grouped Query Attention (expand KV heads if needed) ===
        if self.num_kv_groups > 1:
            # Repeat KV for each query group
            k = k.repeat_interleave(self.num_kv_groups, dim=1)
            v = v.repeat_interleave(self.num_kv_groups, dim=1)

        # === Attention Computation (using SDPA for memory efficiency) ===
        # Use PyTorch's scaled_dot_product_attention which can use FlashAttention backend
        # is_causal only works when q_len == kv_len (prefill), not during decode
        q_len, kv_len = q.shape[2], k.shape[2]
        is_causal = (q_len == kv_len) and (q_len > 1)

        attn_output = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=None,
            dropout_p=0.0,
            is_causal=is_causal,
            scale=self.scaling,
        )  # [batch, num_heads, seq_len, head_dim]

        # === Output Projection ===
        # Transpose back and reshape
        attn_output = attn_output.transpose(1, 2).contiguous()  # [batch, seq_len, num_heads, head_dim]
        attn_output = attn_output.view(batch_size, seq_len, -1)  # [batch, seq_len, hidden_size]
        output = self.o_proj(attn_output)

        # Optional QKV output for debugging
        qkv_dict = None
        if output_qkv:
            qkv_dict = {
                "q": q,  # [batch, num_heads, seq_len, head_dim] (post-RoPE)
                "k": k,  # [batch, num_heads, kv_seq_len, head_dim] (post-RoPE, expanded)
                "v": v,  # [batch, num_heads, kv_seq_len, head_dim] (expanded)
            }

        return output, new_past_key_value, qkv_dict


class Qwen3MLP(nn.Module):
    """
    Qwen3 MLP with SwiGLU activation.

    Data Flow:
    ---------
    hidden_states [batch, seq_len, hidden_size]
            │
            ├──► gate_proj ──► gate [batch, seq_len, intermediate_size]
            │
            └──► up_proj ──► up [batch, seq_len, intermediate_size]
                    │
                    ▼
            silu(gate) * up
                    │
                    ▼
            down_proj ──► output [batch, seq_len, hidden_size]
    """

    def __init__(self, hidden_size: int, intermediate_size: int, bias: bool = False):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        return self.down_proj(F.silu(gate) * up)


class Qwen3DecoderLayer(nn.Module):
    """Single Qwen3 Decoder Layer."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        num_attention_heads: int,
        num_key_value_heads: int,
        head_dim: int,
        max_position_embeddings: int = 32768,
        rope_theta: float = 10000.0,
        rms_norm_eps: float = 1e-6,
        attention_bias: bool = False,
        mlp_bias: bool = False,
        layer_idx: int = 0,
    ):
        super().__init__()
        self.layer_idx = layer_idx

        # Pre-attention LayerNorm
        self.input_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)

        # Self-attention
        self.self_attn = Qwen3Attention(
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            head_dim=head_dim,
            max_position_embeddings=max_position_embeddings,
            rope_theta=rope_theta,
            attention_bias=attention_bias,
            rms_norm_eps=rms_norm_eps,
            layer_idx=layer_idx,
        )

        # Post-attention LayerNorm
        self.post_attention_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)

        # MLP
        self.mlp = Qwen3MLP(hidden_size, intermediate_size, bias=mlp_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
        output_qkv: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]], Optional[dict]]:
        """
        Args:
            hidden_states: [batch, seq_len, hidden_size]
            position_ids: [batch, seq_len]
            attention_mask: Causal attention mask
            past_key_value: KV cache for this layer
            use_cache: Whether to return updated cache
            output_qkv: Whether to output Q, K, V for debugging

        Returns:
            hidden_states: [batch, seq_len, hidden_size]
            past_key_value: Updated cache
            qkv_dict: QKV tensors (if output_qkv=True)
        """
        # === Self Attention Block ===
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        attn_output, new_past_key_value, qkv_dict = self.self_attn(
            hidden_states=hidden_states,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_qkv=output_qkv,
        )

        hidden_states = residual + attn_output

        # === MLP Block ===
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states, new_past_key_value, qkv_dict


class Qwen3Model(nn.Module):
    """Qwen3 Transformer Model (without LM head)."""

    def __init__(
        self,
        vocab_size: int,
        hidden_size: int,
        intermediate_size: int,
        num_hidden_layers: int,
        num_attention_heads: int,
        num_key_value_heads: int,
        head_dim: int,
        max_position_embeddings: int = 32768,
        rope_theta: float = 10000.0,
        rms_norm_eps: float = 1e-6,
        attention_bias: bool = False,
        mlp_bias: bool = False,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_hidden_layers = num_hidden_layers

        # Token embeddings
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

        # Decoder layers
        self.layers = nn.ModuleList([
            Qwen3DecoderLayer(
                hidden_size=hidden_size,
                intermediate_size=intermediate_size,
                num_attention_heads=num_attention_heads,
                num_key_value_heads=num_key_value_heads,
                head_dim=head_dim,
                max_position_embeddings=max_position_embeddings,
                rope_theta=rope_theta,
                rms_norm_eps=rms_norm_eps,
                attention_bias=attention_bias,
                mlp_bias=mlp_bias,
                layer_idx=i,
            )
            for i in range(num_hidden_layers)
        ])

        # Final LayerNorm
        self.norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
        use_cache: bool = False,
        output_qkv_layers: Optional[List[int]] = None,
    ) -> Tuple[torch.Tensor, Optional[List], Optional[dict]]:
        """
        Args:
            input_ids: [batch, seq_len]
            position_ids: [batch, seq_len]
            attention_mask: [batch, seq_len] or pre-computed 4D mask
            past_key_values: List of (k, v) tuples for each layer
            use_cache: Whether to return new cache
            output_qkv_layers: List of layer indices to output QKV for

        Returns:
            hidden_states: [batch, seq_len, hidden_size]
            new_past_key_values: Updated cache
            qkv_outputs: {layer_idx: qkv_dict}
        """
        batch_size, seq_len = input_ids.shape

        # Embedding
        hidden_states = self.embed_tokens(input_ids)

        # Position IDs
        if position_ids is None:
            past_len = past_key_values[0][0].shape[2] if past_key_values else 0
            position_ids = torch.arange(past_len, past_len + seq_len, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)

        # Attention mask (create causal mask if not provided)
        if attention_mask is None or attention_mask.dim() == 2:
            kv_seq_len = seq_len + (past_key_values[0][0].shape[2] if past_key_values else 0)
            causal_mask = torch.triu(
                torch.full((seq_len, kv_seq_len), float("-inf"), device=input_ids.device),
                diagonal=kv_seq_len - seq_len + 1,
            )
            attention_mask = causal_mask.unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, kv_seq_len]

        # Initialize cache list
        new_past_key_values = [] if use_cache else None
        qkv_outputs = {} if output_qkv_layers else None

        # Decoder layers
        for i, layer in enumerate(self.layers):
            past_kv = past_key_values[i] if past_key_values else None
            output_qkv = output_qkv_layers is not None and i in output_qkv_layers

            hidden_states, new_kv, qkv_dict = layer(
                hidden_states=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                past_key_value=past_kv,
                use_cache=use_cache,
                output_qkv=output_qkv,
            )

            if use_cache:
                new_past_key_values.append(new_kv)
            if qkv_dict is not None:
                qkv_outputs[i] = qkv_dict

        # Final norm
        hidden_states = self.norm(hidden_states)

        return hidden_states, new_past_key_values, qkv_outputs


class Qwen3ForCausalLM(nn.Module):
    """Qwen3 Model with Language Modeling head."""

    def __init__(
        self,
        vocab_size: int,
        hidden_size: int,
        intermediate_size: int,
        num_hidden_layers: int,
        num_attention_heads: int,
        num_key_value_heads: int,
        head_dim: int,
        max_position_embeddings: int = 32768,
        rope_theta: float = 10000.0,
        rms_norm_eps: float = 1e-6,
        attention_bias: bool = False,
        mlp_bias: bool = False,
        tie_word_embeddings: bool = True,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings

        # Transformer model
        self.model = Qwen3Model(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            head_dim=head_dim,
            max_position_embeddings=max_position_embeddings,
            rope_theta=rope_theta,
            rms_norm_eps=rms_norm_eps,
            attention_bias=attention_bias,
            mlp_bias=mlp_bias,
        )

        # LM head
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
        use_cache: bool = False,
        output_qkv_layers: Optional[List[int]] = None,
    ) -> Tuple[torch.Tensor, Optional[List], Optional[dict]]:
        """
        Args:
            input_ids: [batch, seq_len]
            ... (same as Qwen3Model)

        Returns:
            logits: [batch, seq_len, vocab_size]
            past_key_values: Updated KV cache
            qkv_outputs: QKV tensors for specified layers
        """
        hidden_states, new_past_key_values, qkv_outputs = self.model(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_qkv_layers=output_qkv_layers,
        )

        logits = self.lm_head(hidden_states)

        return logits, new_past_key_values, qkv_outputs

    @classmethod
    def from_pretrained(cls, model_path: str, dtype: torch.dtype = torch.float16) -> "Qwen3ForCausalLM":
        """
        Load weights from a pretrained Qwen3 model.

        Args:
            model_path: Path to model directory containing config.json and model weights
            dtype: Data type for model weights

        Returns:
            Initialized Qwen3ForCausalLM model
        """
        import json
        import os
        from safetensors.torch import load_file

        # Load config
        config_path = os.path.join(model_path, "config.json")
        with open(config_path) as f:
            config = json.load(f)

        # Create model
        model = cls(
            vocab_size=config["vocab_size"],
            hidden_size=config["hidden_size"],
            intermediate_size=config["intermediate_size"],
            num_hidden_layers=config["num_hidden_layers"],
            num_attention_heads=config["num_attention_heads"],
            num_key_value_heads=config.get("num_key_value_heads", config["num_attention_heads"]),
            head_dim=config.get("head_dim", config["hidden_size"] // config["num_attention_heads"]),
            max_position_embeddings=config.get("max_position_embeddings", 32768),
            rope_theta=config.get("rope_theta", 10000.0),
            rms_norm_eps=config.get("rms_norm_eps", 1e-6),
            attention_bias=config.get("attention_bias", False),
            mlp_bias=config.get("mlp_bias", False),
            tie_word_embeddings=config.get("tie_word_embeddings", True),
        )

        # Load weights
        weight_files = sorted([
            f for f in os.listdir(model_path)
            if f.endswith(".safetensors")
        ])

        state_dict = {}
        for wf in weight_files:
            state_dict.update(load_file(os.path.join(model_path, wf)))

        # Load into model
        model.load_state_dict(state_dict, strict=False)

        # Tie lm_head weights to embed_tokens if configured
        if model.tie_word_embeddings:
            model.lm_head.weight = model.model.embed_tokens.weight

        model = model.to(dtype)

        return model

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 32,
        temperature: float = 1.0,
        do_sample: bool = True,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
    ) -> torch.Tensor:
        """Simple autoregressive generation."""
        device = input_ids.device
        batch_size, seq_len = input_ids.shape
        past_key_values = None
        generated = input_ids.clone()

        for _ in range(max_new_tokens):
            if past_key_values is None:
                current_input = generated
            else:
                current_input = generated[:, -1:]

            logits, past_key_values, _ = self(
                input_ids=current_input,
                past_key_values=past_key_values,
                use_cache=True,
            )

            next_token_logits = logits[:, -1, :]
            if temperature > 0 and do_sample:
                next_token_logits = next_token_logits / temperature
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)

            generated = torch.cat([generated, next_token], dim=1)

            if eos_token_id is not None and (next_token == eos_token_id).all():
                break

        return generated


def print_computation_graph():
    """Print the computation graph for reference."""
    print(__doc__)


if __name__ == "__main__":
    print_computation_graph()