[refactor] Refactor the current GPU and CPU block allocation strategy.

Zijie Tian
2025-12-10 21:23:31 +08:00
parent 0a247ccb1b
commit 190df5f70d
7 changed files with 906 additions and 162 deletions
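
The excerpt below shows only the updated chunked prefill/decode test script; the allocator change itself lives in the other changed files and is not visible here. As a rough mental model of the GPU-first, CPU-fallback block pool that the test parameters (num_gpu_blocks, enable_cpu_offload) and the "Ping-Pong" test names suggest, here is a minimal sketch. The names below (TieredBlockAllocator, allocate, free) are hypothetical and are not nano-vllm's actual API.

# Illustrative sketch only: class and method names are hypothetical, NOT the
# nano-vllm API. It models a GPU-first allocation that spills to CPU blocks.
from collections import deque


class TieredBlockAllocator:
    def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
        # One free list of block ids per tier.
        self.free_gpu = deque(range(num_gpu_blocks))
        self.free_cpu = deque(range(num_cpu_blocks))

    def allocate(self) -> tuple[str, int]:
        # Prefer a GPU block; spill to a CPU block once the GPU pool is empty,
        # which is the situation the tests force with a small num_gpu_blocks.
        if self.free_gpu:
            return "gpu", self.free_gpu.popleft()
        if self.free_cpu:
            return "cpu", self.free_cpu.popleft()
        raise RuntimeError("out of KV-cache blocks")

    def free(self, device: str, block_id: int) -> None:
        # Return the block to the pool it came from.
        (self.free_gpu if device == "gpu" else self.free_cpu).append(block_id)


if __name__ == "__main__":
    pool = TieredBlockAllocator(num_gpu_blocks=2, num_cpu_blocks=2)
    # The third allocation spills to CPU: [('gpu', 0), ('gpu', 1), ('cpu', 0)]
    print([pool.allocate() for _ in range(3)])
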


@@ -14,63 +14,66 @@ os.environ["NANOVLLM_LOG_LEVEL"] = "DEBUG"
from nanovllm import LLM, SamplingParams
def test_chunked_prefill(num_gpu_blocks=10, input_len=8192, output_len=16):
def create_long_context_prompt(target_tokens: int) -> str:
"""
Create a meaningful long context prompt with a question at the end.
The answer depends on information scattered throughout the context.
"""
# Key facts to embed in the context
facts = [
"The capital of France is Paris.",
"The Eiffel Tower was built in 1889.",
"Python was created by Guido van Rossum.",
"The speed of light is approximately 299,792 kilometers per second.",
"Mount Everest is 8,848 meters tall.",
]
# Padding text to reach target length
padding_paragraph = """
This is additional context information that helps extend the length of the prompt.
Machine learning has revolutionized many fields including computer vision, natural language processing, and robotics.
Deep neural networks can learn complex patterns from large amounts of data.
The transformer architecture has become the foundation of modern language models.
Attention mechanisms allow models to focus on relevant parts of the input.
"""
# Build the prompt
prompt_parts = []
# Add instruction
prompt_parts.append("Please read the following information carefully and answer the question at the end.\n\n")
# Add facts at different positions
current_tokens = 50 # approximate tokens so far
tokens_per_padding = 80 # approximate tokens per padding paragraph
fact_interval = target_tokens // (len(facts) + 1)
fact_idx = 0
while current_tokens < target_tokens - 100:
# Add padding
prompt_parts.append(padding_paragraph)
current_tokens += tokens_per_padding
# Add a fact at intervals
if fact_idx < len(facts) and current_tokens > fact_interval * (fact_idx + 1):
prompt_parts.append(f"\n[Important Fact #{fact_idx + 1}]: {facts[fact_idx]}\n")
current_tokens += 20
fact_idx += 1
# Add the question at the end
prompt_parts.append("\n\nQuestion: Based on the information above, what is the capital of France and when was the Eiffel Tower built? Please answer briefly.\n\nAnswer:")
return "".join(prompt_parts)
def test_chunked_prefill(num_gpu_blocks=10, input_len=8192, output_len=64):
"""Test chunked prefill with limited GPU blocks."""
path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
total_blocks = (input_len + 255) // 256
print(f"=" * 60)
print(f"Chunked Prefill Test")
print(f"Chunked Prefill Test (Ping-Pong)")
print(f"=" * 60)
print(f" input_len: {input_len} tokens")
print(f" total_blocks: {total_blocks}")
print(f" num_gpu_blocks: {num_gpu_blocks}")
print(f" blocks_on_cpu: {max(0, total_blocks - num_gpu_blocks)}")
print()
llm = LLM(
path,
enforce_eager=False,
max_model_len=16 * 1024, # 16K is enough for 8K test
max_num_batched_tokens=16 * 1024,
enable_cpu_offload=True,
cpu_memory_gb=4.0,
num_gpu_blocks=num_gpu_blocks,
)
print(f"LLM initialized:")
print(f" num_gpu_kvcache_blocks: {llm.model_runner.config.num_gpu_kvcache_blocks}")
print(f" num_cpu_kvcache_blocks: {llm.model_runner.config.num_cpu_kvcache_blocks}")
print()
# Create prompt with approximate token count
prompt = "Hello " * (input_len // 2)
print(f"Running generation...")
outputs = llm.generate(
[prompt],
SamplingParams(temperature=0.6, max_tokens=output_len),
use_tqdm=True,
)
print()
print(f"Output tokens: {len(outputs[0]['token_ids'])}")
print(f"Output text (first 100 chars): {outputs[0]['text'][:100]}")
print()
return outputs
def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
"""Test chunked decode with limited GPU blocks."""
path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
total_blocks = (input_len + 255) // 256
print(f"=" * 60)
print(f"Chunked Decode Test")
print(f"=" * 60)
print(f" input_len: {input_len} tokens")
print(f" output_len: {output_len} tokens")
print(f" total_blocks: {total_blocks}")
print(f" target_input_len: ~{input_len} tokens")
print(f" num_gpu_blocks: {num_gpu_blocks}")
print()
@@ -80,27 +83,62 @@ def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
max_model_len=16 * 1024,
max_num_batched_tokens=16 * 1024,
enable_cpu_offload=True,
cpu_memory_gb=4.0,
num_gpu_blocks=num_gpu_blocks,
)
print(f"LLM initialized:")
print(f" num_gpu_kvcache_blocks: {llm.model_runner.config.num_gpu_kvcache_blocks}")
print(f" num_cpu_kvcache_blocks: {llm.model_runner.config.num_cpu_kvcache_blocks}")
print()
prompt = "Hello " * (input_len // 2)
# Create meaningful prompt
prompt = create_long_context_prompt(input_len)
print(f"Running generation...")
outputs = llm.generate(
[prompt],
SamplingParams(temperature=0.6, max_tokens=output_len),
use_tqdm=True,
SamplingParams(temperature=0.1, max_tokens=output_len), # low temperature for more deterministic output
use_tqdm=False,
)
print()
print(f"Output tokens: {len(outputs[0]['token_ids'])}")
print(f"Output text (first 100 chars): {outputs[0]['text'][:100]}")
print(f"Output text:\n{outputs[0]['text']}")
print()
return outputs
def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=128):
"""Test chunked decode with limited GPU blocks."""
path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
print(f"=" * 60)
print(f"Chunked Decode Test (Ping-Pong)")
print(f"=" * 60)
print(f" target_input_len: ~{input_len} tokens")
print(f" output_len: {output_len} tokens")
print(f" num_gpu_blocks: {num_gpu_blocks}")
print()
llm = LLM(
path,
enforce_eager=False,
max_model_len=16 * 1024,
max_num_batched_tokens=16 * 1024,
enable_cpu_offload=True,
num_gpu_blocks=num_gpu_blocks,
)
print()
# Create meaningful prompt
prompt = create_long_context_prompt(input_len)
print(f"Running generation...")
outputs = llm.generate(
[prompt],
SamplingParams(temperature=0.1, max_tokens=output_len),
use_tqdm=False,
)
print()
print(f"Output tokens: {len(outputs[0]['token_ids'])}")
print(f"Output text:\n{outputs[0]['text']}")
print()
return outputs
@@ -108,7 +146,7 @@ def test_chunked_decode(num_gpu_blocks=10, input_len=8192, output_len=64):
if __name__ == "__main__":
# Parse arguments
num_gpu_blocks = int(sys.argv[1]) if len(sys.argv) > 1 else 10
input_len = int(sys.argv[2]) if len(sys.argv) > 2 else 8192
output_len = int(sys.argv[3]) if len(sys.argv) > 3 else 32
input_len = int(sys.argv[2]) if len(sys.argv) > 2 else 2048
output_len = int(sys.argv[3]) if len(sys.argv) > 3 else 64
test_chunked_prefill(num_gpu_blocks, input_len, output_len)
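
Note that __main__ only runs test_chunked_prefill; test_chunked_decode has to be invoked directly. A minimal driver for both, assuming the script is importable as test_chunked_prefill (the file's path is not shown in this excerpt), could look like this:

# Hypothetical driver: the module name "test_chunked_prefill" is an assumption,
# since the diff does not show the file's path. The keyword arguments mirror the
# function signatures above.
from test_chunked_prefill import test_chunked_decode, test_chunked_prefill

test_chunked_prefill(num_gpu_blocks=10, input_len=2048, output_len=64)
test_chunked_decode(num_gpu_blocks=10, input_len=2048, output_len=128)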