import os import time import torch from nanovllm import LLM, SamplingParams batch_size = 256 seq_len = 1024 max_tokens = 512 path = os.path.expanduser("~/huggingface/Qwen3-0.6B/") llm = LLM(path, enforce_eager=False) prompt_token_ids = torch.randint(0, 10240, (batch_size, seq_len)).tolist() sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=max_tokens) t = time.time() completions = llm.generate(prompt_token_ids, sampling_params) throughput = batch_size * max_tokens / (time.time() - t) print(f"Throughput: {throughput: .2f}")