"""Example script: generate chat completions locally with nano-vllm and Qwen3-0.6B."""
import os

from nanovllm import LLM, SamplingParams
from transformers import AutoTokenizer

def main():
    """Load a local Qwen3-0.6B model, run two chat prompts, and print results.

    Side effects: reads model/tokenizer files from ``~/huggingface/Qwen3-0.6B/``
    and writes each prompt/completion pair to stdout.
    """
    model_dir = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Eager mode, single GPU — no CUDA-graph capture, no tensor parallelism.
    llm = LLM(model_dir, enforce_eager=True, tensor_parallel_size=1)

    sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
    raw_prompts = [
        "introduce yourself",
        "list all prime numbers within 100",
    ]

    # Wrap each raw prompt in the model's chat template; keep the result as a
    # string (tokenize=False) and enable the model's "thinking" preamble.
    prompts = []
    for raw in raw_prompts:
        rendered = tokenizer.apply_chat_template(
            [{"role": "user", "content": raw}],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True,
        )
        prompts.append(rendered)

    outputs = llm.generate(prompts, sampling_params)

    for prompt, output in zip(prompts, outputs):
        print("\n")
        print(f"Prompt: {prompt!r}")
        print(f"Completion: {output['text']!r}")
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()