From b3685c9190e764b574bd58871e29fcab6a186d3b Mon Sep 17 00:00:00 2001
From: Zijie Tian
Date: Sat, 3 Jan 2026 18:55:58 +0800
Subject: [PATCH] [test] Added test_align.py

---
 tests/test_align.py | 114 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 tests/test_align.py

diff --git a/tests/test_align.py b/tests/test_align.py
new file mode 100644
index 0000000..0c42d52
--- /dev/null
+++ b/tests/test_align.py
@@ -0,0 +1,114 @@
+"""
+Test attention I/O observation with CPU offload.
+Uses hooks to observe attention layer inputs (Q, K, V) and outputs.
+"""
+
+import os
+os.environ["NANOVLLM_LOG_LEVEL"] = "WARNING"
+
+from nanovllm import LLM, SamplingParams
+from utils import generate_needle_prompt
+
+# Config
+MODEL_PATH = os.path.expanduser("~/models/Qwen3-0.6B/")
+MAX_MODEL_LEN = 32 * 1024
+NUM_GPU_BLOCKS = 4
+INPUT_LEN = 32 * 1024
+BLOCK_SIZE = 1024
+
+
+def make_attention_io_hook(layer_id: int, hook_type: str = "pre"):
+    """
+    Create hooks to inspect attention inputs/outputs.
+
+    Hook positions on decoder_layer.self_attn.attn:
+    - PRE HOOK inputs: (q, k, v, ...) - Q/K/V tensors AFTER projection, AFTER RoPE
+    - POST HOOK output: attention_output tensor - shape [batch, seq_len, num_heads * head_dim]
+
+    Alternative hook position on decoder_layer.self_attn:
+    - PRE HOOK inputs: (hidden_states, ...) - BEFORE Q/K/V projection
+    - POST HOOK output: (attn_output, attn_weights, past_key_value)
+    """
+    def pre_hook(module, inputs):
+        """
+        Attention input hook - captures Q, K, V tensors.
+
+        Position: decoder_layer.self_attn.attn (the Attention layer)
+        inputs[0] = Q tensor: [batch, seq_len, num_heads, head_dim]
+        inputs[1] = K tensor: [batch, seq_len, num_kv_heads, head_dim]
+        inputs[2] = V tensor: [batch, seq_len, num_kv_heads, head_dim]
+        """
+        if len(inputs) >= 3:
+            q, k, v = inputs[0], inputs[1], inputs[2]
+            print(f"\n[Layer {layer_id}] ATTENTION INPUT (pre-hook on self_attn.attn):")
+            print(f"  Q shape: {q.shape}, dtype: {q.dtype}, mean: {q.float().mean():.4f}")
+            print(f"  K shape: {k.shape}, dtype: {k.dtype}, mean: {k.float().mean():.4f}")
+            print(f"  V shape: {v.shape}, dtype: {v.dtype}, mean: {v.float().mean():.4f}")
+        return None  # Don't modify inputs
+
+    def post_hook(module, inputs, output):
+        """
+        Attention output hook - captures attention result.
+
+        Position: decoder_layer.self_attn.attn (the Attention layer)
+        output = attention_output tensor: [batch, seq_len, num_heads * head_dim]
+
+        NOTE: This is the output AFTER attention computation but BEFORE output projection.
+ """ + # output can be tensor or tuple depending on implementation + if isinstance(output, tuple): + attn_output = output[0] + else: + attn_output = output + print(f"\n[Layer {layer_id}] ATTENTION OUTPUT (post-hook on self_attn.attn):") + print(f" Output shape: {attn_output.shape}, dtype: {attn_output.dtype}") + print(f" Output mean: {attn_output.float().mean():.4f}, std: {attn_output.float().std():.4f}") + return None # Don't modify output + + return pre_hook if hook_type == "pre" else post_hook + + +# Main +llm = LLM( + MODEL_PATH, + enforce_eager=True, + max_model_len=MAX_MODEL_LEN, + max_num_batched_tokens=MAX_MODEL_LEN, + enable_cpu_offload=True, + kvcache_block_size=BLOCK_SIZE, + num_gpu_blocks=NUM_GPU_BLOCKS, + dtype="float16", +) + +# ============================================================ +# Register I/O hooks to inspect attention inputs/outputs +# ============================================================ +# Only enable for first 2 layers to avoid excessive output +io_hooks = [] +for layer_idx, decoder_layer in enumerate(llm.model_runner.model.model.layers): + if layer_idx >= 2: # Only first 2 layers + break + + # Position: decoder_layer.self_attn.attn (the Attention layer) + # - PRE hook sees: Q, K, V tensors (AFTER projection, AFTER RoPE) + # - POST hook sees: attention output (BEFORE output projection) + io_hooks.append(decoder_layer.self_attn.attn.register_forward_pre_hook( + make_attention_io_hook(layer_idx, "pre") + )) + io_hooks.append(decoder_layer.self_attn.attn.register_forward_hook( + make_attention_io_hook(layer_idx, "post") + )) + +prompt, expected = generate_needle_prompt( + tokenizer=llm.tokenizer, + target_length=INPUT_LEN, + needle_position=0.5, + needle_value="7492", + verbose=True, +) +outputs = llm.generate([prompt], SamplingParams(temperature=0.6, max_tokens=16), use_tqdm=False) + +for hook in io_hooks: + hook.remove() + +print("test_align: PASSED")
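
Note (outside the diff): the hooks registered above follow the standard torch.nn.Module forward-hook protocol, in which a pre-hook receives (module, inputs), a post-hook receives (module, inputs, output), and returning None leaves the tensors untouched. Below is a minimal, self-contained sketch of that same pattern on a toy nn.Linear; it is illustrative only (the toy module, shapes, and names are assumptions, not part of nanovllm), but it can be used to sanity-check the hook signatures before attaching them to the attention layers.

    # Illustrative sketch, not part of the patch: forward hooks on a toy module.
    import torch
    import torch.nn as nn

    def pre_hook(module, inputs):
        # inputs is the tuple of positional args passed to module.forward()
        print(f"pre:  input shape {inputs[0].shape}")
        return None  # returning None keeps the inputs unchanged

    def post_hook(module, inputs, output):
        print(f"post: output shape {output.shape}")
        return None  # returning None keeps the output unchanged

    layer = nn.Linear(16, 16)  # toy stand-in for the attention module
    handles = [
        layer.register_forward_pre_hook(pre_hook),
        layer.register_forward_hook(post_hook),
    ]
    layer(torch.randn(2, 16))  # triggers both hooks
    for h in handles:
        h.remove()  # detach hooks once observation is done, as the test does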