From b3685c9190e764b574bd58871e29fcab6a186d3b Mon Sep 17 00:00:00 2001
From: Zijie Tian
Date: Sat, 3 Jan 2026 18:55:58 +0800
Subject: [PATCH] [test] Added test_align.py

---
 tests/test_align.py | 114 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 tests/test_align.py

diff --git a/tests/test_align.py b/tests/test_align.py
new file mode 100644
index 0000000..0c42d52
--- /dev/null
+++ b/tests/test_align.py
@@ -0,0 +1,114 @@
+"""
+Test attention I/O observation with CPU offload.
+Uses hooks to observe attention layer inputs (Q, K, V) and outputs.
+"""
+
+import os
+os.environ["NANOVLLM_LOG_LEVEL"] = "WARNING"
+
+from nanovllm import LLM, SamplingParams
+from utils import generate_needle_prompt
+
+# Config
+MODEL_PATH = os.path.expanduser("~/models/Qwen3-0.6B/")
+MAX_MODEL_LEN = 32 * 1024
+NUM_GPU_BLOCKS = 4
+INPUT_LEN = 32 * 1024
+BLOCK_SIZE = 1024
+
+
+def make_attention_io_hook(layer_id: int, hook_type: str = "pre"):
+    """
+    Create hooks to inspect attention inputs/outputs.
+
+    Hook positions on decoder_layer.self_attn.attn:
+    - PRE HOOK inputs: (q, k, v, ...) - Q/K/V tensors AFTER projection, AFTER RoPE
+    - POST HOOK output: attention_output tensor - shape [batch, seq_len, num_heads * head_dim]
+
+    Alternative hook position on decoder_layer.self_attn:
+    - PRE HOOK inputs: (hidden_states, ...) - BEFORE Q/K/V projection
+    - POST HOOK output: (attn_output, attn_weights, past_key_value)
+    """
+    def pre_hook(module, inputs):
+        """
+        Attention input hook - captures Q, K, V tensors.
+
+        Position: decoder_layer.self_attn.attn (the Attention layer)
+        inputs[0] = Q tensor: [batch, seq_len, num_heads, head_dim]
+        inputs[1] = K tensor: [batch, seq_len, num_kv_heads, head_dim]
+        inputs[2] = V tensor: [batch, seq_len, num_kv_heads, head_dim]
+        """
+        if len(inputs) >= 3:
+            q, k, v = inputs[0], inputs[1], inputs[2]
+            print(f"\n[Layer {layer_id}] ATTENTION INPUT (pre-hook on self_attn.attn):")
+            print(f"  Q shape: {q.shape}, dtype: {q.dtype}, mean: {q.float().mean():.4f}")
+            print(f"  K shape: {k.shape}, dtype: {k.dtype}, mean: {k.float().mean():.4f}")
+            print(f"  V shape: {v.shape}, dtype: {v.dtype}, mean: {v.float().mean():.4f}")
+        return None  # Don't modify inputs
+
+    def post_hook(module, inputs, output):
+        """
+        Attention output hook - captures attention result.
+
+        Position: decoder_layer.self_attn.attn (the Attention layer)
+        output = attention_output tensor: [batch, seq_len, num_heads * head_dim]
+
+        NOTE: This is the output AFTER attention computation but BEFORE output projection.
+ """ + # output can be tensor or tuple depending on implementation + if isinstance(output, tuple): + attn_output = output[0] + else: + attn_output = output + print(f"\n[Layer {layer_id}] ATTENTION OUTPUT (post-hook on self_attn.attn):") + print(f" Output shape: {attn_output.shape}, dtype: {attn_output.dtype}") + print(f" Output mean: {attn_output.float().mean():.4f}, std: {attn_output.float().std():.4f}") + return None # Don't modify output + + return pre_hook if hook_type == "pre" else post_hook + + +# Main +llm = LLM( + MODEL_PATH, + enforce_eager=True, + max_model_len=MAX_MODEL_LEN, + max_num_batched_tokens=MAX_MODEL_LEN, + enable_cpu_offload=True, + kvcache_block_size=BLOCK_SIZE, + num_gpu_blocks=NUM_GPU_BLOCKS, + dtype="float16", +) + +# ============================================================ +# Register I/O hooks to inspect attention inputs/outputs +# ============================================================ +# Only enable for first 2 layers to avoid excessive output +io_hooks = [] +for layer_idx, decoder_layer in enumerate(llm.model_runner.model.model.layers): + if layer_idx >= 2: # Only first 2 layers + break + + # Position: decoder_layer.self_attn.attn (the Attention layer) + # - PRE hook sees: Q, K, V tensors (AFTER projection, AFTER RoPE) + # - POST hook sees: attention output (BEFORE output projection) + io_hooks.append(decoder_layer.self_attn.attn.register_forward_pre_hook( + make_attention_io_hook(layer_idx, "pre") + )) + io_hooks.append(decoder_layer.self_attn.attn.register_forward_hook( + make_attention_io_hook(layer_idx, "post") + )) + +prompt, expected = generate_needle_prompt( + tokenizer=llm.tokenizer, + target_length=INPUT_LEN, + needle_position=0.5, + needle_value="7492", + verbose=True, +) +outputs = llm.generate([prompt], SamplingParams(temperature=0.6, max_tokens=16), use_tqdm=False) + +for hook in io_hooks: + hook.remove() + +print("test_align: PASSED")
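
Note (outside the diff): the hooks registered above follow the standard torch.nn.Module forward-hook protocol, in which a pre-hook receives (module, inputs), a post-hook receives (module, inputs, output), and returning None leaves the tensors untouched. Below is a minimal, self-contained sketch of that same pattern on a toy nn.Linear; it is illustrative only (the toy module, shapes, and names are assumptions, not part of nanovllm), but it can be used to sanity-check the hook signatures before attaching them to the attention layers.

    # Illustrative sketch, not part of the patch: forward hooks on a toy module.
    import torch
    import torch.nn as nn

    def pre_hook(module, inputs):
        # inputs is the tuple of positional args passed to module.forward()
        print(f"pre:  input shape {inputs[0].shape}")
        return None  # returning None keeps the inputs unchanged

    def post_hook(module, inputs, output):
        print(f"post: output shape {output.shape}")
        return None  # returning None keeps the output unchanged

    layer = nn.Linear(16, 16)  # toy stand-in for the attention module
    handles = [
        layer.register_forward_pre_hook(pre_hook),
        layer.register_forward_hook(post_hook),
    ]
    layer(torch.randn(2, 16))  # triggers both hooks
    for h in handles:
        h.remove()  # detach hooks once observation is done, as the test does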