nano-vllm/tests/test_chunked_attention.py
"""
Test chunked attention with small num_gpu_blocks to trigger CPU offload.
For 8K tokens with block_size=256:
- Total blocks needed: 8192 / 256 = 32 blocks
- With num_gpu_blocks=10, 22 blocks go to CPU -> triggers chunked attention
"""
import os
import sys
# Enable debug logging before importing nanovllm
os.environ["NANOVLLM_LOG_LEVEL"] = "DEBUG"
from nanovllm import LLM, SamplingParams
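

# A minimal sketch of the block arithmetic from the module docstring, useful for picking
# test parameters. BLOCK_SIZE is an assumption taken from the docstring (256), not read
# from the engine config; the actual split is decided by the engine's block manager.
BLOCK_SIZE = 256


def expected_block_split(input_len: int, num_gpu_blocks: int, block_size: int = BLOCK_SIZE):
    """Return (total_blocks, cpu_blocks) for a given input length.

    Example: 8192 tokens / 256 = 32 blocks; with num_gpu_blocks=10, the remaining
    22 blocks spill to CPU, which is what triggers chunked attention.
    """
    total_blocks = -(-input_len // block_size)  # ceiling division
    cpu_blocks = max(0, total_blocks - num_gpu_blocks)
    return total_blocks, cpu_blocks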


def create_long_context_prompt(target_tokens: int) -> str:
    """
    Create a meaningful long context prompt with a question at the end.
    The answer depends on information scattered throughout the context.
    """
    # Key facts to embed in the context
    facts = [
        "The capital of France is Paris.",
        "The Eiffel Tower was built in 1889.",
        "Python was created by Guido van Rossum.",
        "The speed of light is approximately 299,792 kilometers per second.",
        "Mount Everest is 8,848 meters tall.",
    ]

    # Padding text to reach target length
    padding_paragraph = """
This is additional context information that helps extend the length of the prompt.
Machine learning has revolutionized many fields including computer vision, natural language processing, and robotics.
Deep neural networks can learn complex patterns from large amounts of data.
The transformer architecture has become the foundation of modern language models.
Attention mechanisms allow models to focus on relevant parts of the input.
"""

    # Build the prompt
    prompt_parts = []

    # Add instruction
    prompt_parts.append("Please read the following information carefully and answer the question at the end.\n\n")

    # Add facts at different positions
    current_tokens = 50  # approximate tokens so far
    tokens_per_padding = 80  # approximate tokens per padding paragraph
    fact_interval = target_tokens // (len(facts) + 1)
    fact_idx = 0
    while current_tokens < target_tokens - 100:
        # Add padding
        prompt_parts.append(padding_paragraph)
        current_tokens += tokens_per_padding
        # Add a fact at intervals
        if fact_idx < len(facts) and current_tokens > fact_interval * (fact_idx + 1):
            prompt_parts.append(f"\n[Important Fact #{fact_idx + 1}]: {facts[fact_idx]}\n")
            current_tokens += 20
            fact_idx += 1

    # Add the question at the end
    prompt_parts.append("\n\nQuestion: Based on the information above, what is the speed of light?\n\nAnswer:")
    return "".join(prompt_parts)


def test_chunked_prefill(num_gpu_blocks=10, input_len=8192, output_len=64, num_prefetch_blocks=2):
    """Test chunked prefill with limited GPU blocks."""
    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")

    print("=" * 60)
    print("Chunked Prefill Test (Chunked Offload)")
    print("=" * 60)
    print(f" target_input_len: ~{input_len} tokens")
    print(f" num_gpu_blocks: {num_gpu_blocks}")
    print(f" num_prefetch_blocks: {num_prefetch_blocks}")
    print()

    llm = LLM(
        path,
        enforce_eager=False,
        max_model_len=128 * 1024,
        max_num_batched_tokens=128 * 1024,
        enable_cpu_offload=True,
        num_gpu_blocks=num_gpu_blocks,
        num_prefetch_blocks=num_prefetch_blocks,
    )
    print()

    # Create meaningful prompt
    prompt = create_long_context_prompt(input_len)

    print("Running generation...")
    outputs = llm.generate(
        [prompt],
        SamplingParams(temperature=0.1, max_tokens=output_len),  # low temperature for more deterministic output
        use_tqdm=False,
    )

    print()
    print(f"Output tokens: {len(outputs[0]['token_ids'])}")
    print(f"Output text:\n{outputs[0]['text']}")
    print()
    return outputs


def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=128, num_prefetch_blocks=2):
    """Test chunked decode with limited GPU blocks."""
    path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")

    print("=" * 60)
    print("Chunked Decode Test (Chunked Offload)")
    print("=" * 60)
    print(f" target_input_len: ~{input_len} tokens")
    print(f" output_len: {output_len} tokens")
    print(f" num_gpu_blocks: {num_gpu_blocks}")
    print(f" num_prefetch_blocks: {num_prefetch_blocks}")
    print()

    llm = LLM(
        path,
        enforce_eager=False,
        max_model_len=128 * 1024,
        max_num_batched_tokens=128 * 1024,
        enable_cpu_offload=True,
        num_gpu_blocks=num_gpu_blocks,
        num_prefetch_blocks=num_prefetch_blocks,
    )
    print()

    # Create meaningful prompt
    prompt = create_long_context_prompt(input_len)

    print("Running generation...")
    outputs = llm.generate(
        [prompt],
        SamplingParams(temperature=0.1, max_tokens=output_len),
        use_tqdm=False,
    )

    print()
    print(f"Output tokens: {len(outputs[0]['token_ids'])}")
    print(f"Output text:\n{outputs[0]['text']}")
    print()
    return outputs


if __name__ == "__main__":
    # Parse arguments: num_gpu_blocks input_len output_len [num_prefetch_blocks]
    num_gpu_blocks = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    input_len = int(sys.argv[2]) if len(sys.argv) > 2 else 2048
    output_len = int(sys.argv[3]) if len(sys.argv) > 3 else 64
    num_prefetch_blocks = int(sys.argv[4]) if len(sys.argv) > 4 else 2

    test_chunked_prefill(num_gpu_blocks, input_len, output_len, num_prefetch_blocks)
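
# Example invocations (arguments are positional and optional):
#   python tests/test_chunked_attention.py                  # defaults: 10 GPU blocks, ~2048-token input
#   python tests/test_chunked_attention.py 10 8192 64 2     # ~8K input -> 32 blocks, 22 offloaded to CPU
# test_chunked_decode() takes the same arguments and can be run by calling it here instead.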