"""Example: generate chat completions with nano-vllm and a Qwen3 model.

Loads a local Qwen3-0.6B checkpoint, wraps two user prompts in the
tokenizer's chat template, samples a completion for each, and prints
the prompt/completion pairs.
"""

import os

from nanovllm import LLM, SamplingParams
from transformers import AutoTokenizer


def main() -> None:
    """Run the demo end to end: load model, build prompts, generate, print."""
    # Path to a locally downloaded Qwen3-0.6B checkpoint.
    path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
    tokenizer = AutoTokenizer.from_pretrained(path)
    # enforce_eager=True skips CUDA-graph capture — simpler startup for a demo.
    llm = LLM(path, enforce_eager=True)

    sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
    prompts = [
        "自我介绍一下吧!",
        "列出100内所有素数",
    ]
    # Wrap each raw user message in the model's chat template. tokenize=False
    # keeps the result as plain strings, which is what llm.generate expects;
    # enable_thinking=True turns on Qwen3's reasoning mode in the template.
    prompts = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True,
        )
        for prompt in prompts
    ]

    outputs = llm.generate(prompts, sampling_params)
    for prompt, output in zip(prompts, outputs):
        # Blank separator line between entries (prints two newlines).
        print("\n")
        print(f"Prompt: {prompt!r}")
        print(f"Completion: {output['text']!r}")


if __name__ == "__main__":
    main()