[refactor] Refactor the current GPU and CPU block allocation strategy.

Zijie Tian
2025-12-10 21:23:31 +08:00
parent 0a247ccb1b
commit 190df5f70d
7 changed files with 906 additions and 162 deletions
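
The excerpt below shows only the updated chunked prefill/decode test script; the allocator change itself lives in the other changed files and is not visible here. As a rough mental model of the GPU-first, CPU-fallback block pool that the test parameters (num_gpu_blocks, enable_cpu_offload) and the "Ping-Pong" test names suggest, here is a minimal sketch. The names below (TieredBlockAllocator, allocate, free) are hypothetical and are not nano-vllm's actual API.

# Illustrative sketch only: class and method names are hypothetical, NOT the
# nano-vllm API. It models a GPU-first allocation that spills to CPU blocks.
from collections import deque


class TieredBlockAllocator:
    def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
        # One free list of block ids per tier.
        self.free_gpu = deque(range(num_gpu_blocks))
        self.free_cpu = deque(range(num_cpu_blocks))

    def allocate(self) -> tuple[str, int]:
        # Prefer a GPU block; spill to a CPU block once the GPU pool is empty,
        # which is the situation the tests force with a small num_gpu_blocks.
        if self.free_gpu:
            return "gpu", self.free_gpu.popleft()
        if self.free_cpu:
            return "cpu", self.free_cpu.popleft()
        raise RuntimeError("out of KV-cache blocks")

    def free(self, device: str, block_id: int) -> None:
        # Return the block to the pool it came from.
        (self.free_gpu if device == "gpu" else self.free_cpu).append(block_id)


if __name__ == "__main__":
    pool = TieredBlockAllocator(num_gpu_blocks=2, num_cpu_blocks=2)
    # The third allocation spills to CPU: [('gpu', 0), ('gpu', 1), ('cpu', 0)]
    print([pool.allocate() for _ in range(3)])
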


@@ -14,63 +14,66 @@ os.environ["NANOVLLM_LOG_LEVEL"] = "DEBUG"
from nanovllm import LLM, SamplingParams
def test_chunked_prefill(num_gpu_blocks=10, input_len=8192, output_len=16):
def create_long_context_prompt(target_tokens: int) -> str:
"""
Create a meaningful long context prompt with a question at the end.
The answer depends on information scattered throughout the context.
"""
# Key facts to embed in the context
facts = [
"The capital of France is Paris.",
"The Eiffel Tower was built in 1889.",
"Python was created by Guido van Rossum.",
"The speed of light is approximately 299,792 kilometers per second.",
"Mount Everest is 8,848 meters tall.",
]
# Padding text to reach target length
padding_paragraph = """
This is additional context information that helps extend the length of the prompt.
Machine learning has revolutionized many fields including computer vision, natural language processing, and robotics.
Deep neural networks can learn complex patterns from large amounts of data.
The transformer architecture has become the foundation of modern language models.
Attention mechanisms allow models to focus on relevant parts of the input.
"""
# Build the prompt
prompt_parts = []
# Add instruction
prompt_parts.append("Please read the following information carefully and answer the question at the end.\n\n")
# Add facts at different positions
current_tokens = 50 # approximate tokens so far
tokens_per_padding = 80 # approximate tokens per padding paragraph
fact_interval = target_tokens // (len(facts) + 1)
fact_idx = 0
while current_tokens < target_tokens - 100:
# Add padding
prompt_parts.append(padding_paragraph)
current_tokens += tokens_per_padding
# Add a fact at intervals
if fact_idx < len(facts) and current_tokens > fact_interval * (fact_idx + 1):
prompt_parts.append(f"\n[Important Fact #{fact_idx + 1}]: {facts[fact_idx]}\n")
current_tokens += 20
fact_idx += 1
# Add the question at the end
prompt_parts.append("\n\nQuestion: Based on the information above, what is the capital of France and when was the Eiffel Tower built? Please answer briefly.\n\nAnswer:")
return "".join(prompt_parts)
def test_chunked_prefill(num_gpu_blocks=10, input_len=8192, output_len=64):
"""Test chunked prefill with limited GPU blocks."""
path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
total_blocks = (input_len + 255) // 256
print(f"=" * 60)
print(f"Chunked Prefill Test")
print(f"Chunked Prefill Test (Ping-Pong)")
print(f"=" * 60)
print(f" input_len: {input_len} tokens")
print(f" total_blocks: {total_blocks}")
print(f" num_gpu_blocks: {num_gpu_blocks}")
print(f" blocks_on_cpu: {max(0, total_blocks - num_gpu_blocks)}")
print()
llm = LLM(
path,
enforce_eager=False,
max_model_len=16 * 1024, # 16K is enough for 8K test
max_num_batched_tokens=16 * 1024,
enable_cpu_offload=True,
cpu_memory_gb=4.0,
num_gpu_blocks=num_gpu_blocks,
)
print(f"LLM initialized:")
print(f" num_gpu_kvcache_blocks: {llm.model_runner.config.num_gpu_kvcache_blocks}")
print(f" num_cpu_kvcache_blocks: {llm.model_runner.config.num_cpu_kvcache_blocks}")
print()
# Create prompt with approximate token count
prompt = "Hello " * (input_len // 2)
print(f"Running generation...")
outputs = llm.generate(
[prompt],
SamplingParams(temperature=0.6, max_tokens=output_len),
use_tqdm=True,
)
print()
print(f"Output tokens: {len(outputs[0]['token_ids'])}")
print(f"Output text (first 100 chars): {outputs[0]['text'][:100]}")
print()
return outputs
def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
"""Test chunked decode with limited GPU blocks."""
path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
total_blocks = (input_len + 255) // 256
print(f"=" * 60)
print(f"Chunked Decode Test")
print(f"=" * 60)
print(f" input_len: {input_len} tokens")
print(f" output_len: {output_len} tokens")
print(f" total_blocks: {total_blocks}")
print(f" target_input_len: ~{input_len} tokens")
print(f" num_gpu_blocks: {num_gpu_blocks}")
print()
@@ -80,27 +83,62 @@ def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
max_model_len=16 * 1024,
max_num_batched_tokens=16 * 1024,
enable_cpu_offload=True,
cpu_memory_gb=4.0,
num_gpu_blocks=num_gpu_blocks,
)
print(f"LLM initialized:")
print(f" num_gpu_kvcache_blocks: {llm.model_runner.config.num_gpu_kvcache_blocks}")
print(f" num_cpu_kvcache_blocks: {llm.model_runner.config.num_cpu_kvcache_blocks}")
print()
prompt = "Hello " * (input_len // 2)
# Create meaningful prompt
prompt = create_long_context_prompt(input_len)
print(f"Running generation...")
outputs = llm.generate(
[prompt],
SamplingParams(temperature=0.6, max_tokens=output_len),
use_tqdm=True,
SamplingParams(temperature=0.1, max_tokens=output_len), # low temperature for more deterministic output
use_tqdm=False,
)
print()
print(f"Output tokens: {len(outputs[0]['token_ids'])}")
print(f"Output text (first 100 chars): {outputs[0]['text'][:100]}")
print(f"Output text:\n{outputs[0]['text']}")
print()
return outputs
def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=128):
"""Test chunked decode with limited GPU blocks."""
path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
print(f"=" * 60)
print(f"Chunked Decode Test (Ping-Pong)")
print(f"=" * 60)
print(f" target_input_len: ~{input_len} tokens")
print(f" output_len: {output_len} tokens")
print(f" num_gpu_blocks: {num_gpu_blocks}")
print()
llm = LLM(
path,
enforce_eager=False,
max_model_len=16 * 1024,
max_num_batched_tokens=16 * 1024,
enable_cpu_offload=True,
num_gpu_blocks=num_gpu_blocks,
)
print()
# Create meaningful prompt
prompt = create_long_context_prompt(input_len)
print(f"Running generation...")
outputs = llm.generate(
[prompt],
SamplingParams(temperature=0.1, max_tokens=output_len),
use_tqdm=False,
)
print()
print(f"Output tokens: {len(outputs[0]['token_ids'])}")
print(f"Output text:\n{outputs[0]['text']}")
print()
return outputs
@@ -108,7 +146,7 @@ def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
if __name__ == "__main__":
# Parse arguments
num_gpu_blocks = int(sys.argv[1]) if len(sys.argv) > 1 else 10
input_len = int(sys.argv[2]) if len(sys.argv) > 2 else 8192
output_len = int(sys.argv[3]) if len(sys.argv) > 3 else 32
input_len = int(sys.argv[2]) if len(sys.argv) > 2 else 2048
output_len = int(sys.argv[3]) if len(sys.argv) > 3 else 64
test_chunked_prefill(num_gpu_blocks, input_len, output_len)
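
Note that __main__ only runs test_chunked_prefill; test_chunked_decode has to be invoked directly. A minimal driver for both, assuming the script is importable as test_chunked_prefill (the file's path is not shown in this excerpt), could look like this:

# Hypothetical driver: the module name "test_chunked_prefill" is an assumption,
# since the diff does not show the file's path. The keyword arguments mirror the
# function signatures above.
from test_chunked_prefill import test_chunked_decode, test_chunked_prefill

test_chunked_prefill(num_gpu_blocks=10, input_len=2048, output_len=64)
test_chunked_decode(num_gpu_blocks=10, input_len=2048, output_len=128)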