#!/usr/bin/env python3
import time

from vllm import LLM, SamplingParams


def main():
    # Hard-coded model and tensor parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Define sampling parameters with an increased max_tokens and a stop string.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,
        max_tokens=32000,   # Increase this to allow longer responses.
        stop=["\nUser:"],   # Stop when the model outputs a new user marker.
    )

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
    print("Model loaded. You can now chat!")
    print("Type 'exit' or 'quit' to end the conversation.\n")

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break
        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Append the user's input to the conversation history.
        conversation += f"User: {user_input}\nBot: "
        print("Bot: ", end="", flush=True)

        # Generate a response using the conversation history and sampling parameters.
        response = model.generate(conversation, sampling_params=sampling_params)

        # Extract the generated reply.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust delay (in seconds) as desired.
        print()  # Newline after the bot reply.

        # Append the bot reply to the conversation history.
        conversation += bot_reply + "\n"


if __name__ == "__main__":
    main()
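The script above builds its prompt by concatenating plain `User:` / `Bot:` strings. Since this checkpoint descends from Qwen/Qwen2.5-Coder-14B-Instruct (see the model tree below), it may respond more reliably when the conversation is rendered with the chat template shipped alongside the tokenizer. Below is a minimal sketch of that approach, assuming the repo bundles a Qwen2.5-style chat template; the system prompt and example question are placeholders.

```python
# Minimal sketch: render the conversation with the model's chat template
# (assumes the repo ships a Qwen2.5-style template with its tokenizer).
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_path = "miike-ai/qwen-14b-coder-fp8"
tokenizer = AutoTokenizer.from_pretrained(model_path)
llm = LLM(model=model_path, dtype="auto")

messages = [
    {"role": "system", "content": "You are a helpful coding assistant."},          # placeholder
    {"role": "user", "content": "Write a function that reverses a linked list."},  # placeholder
]

# Turn the message list into the exact prompt string the model expects,
# including the trailing assistant header so generation starts in the right place.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=1024)
outputs = llm.generate([prompt], sampling_params=params)
print(outputs[0].outputs[0].text)
```

Newer vLLM releases also expose an `LLM.chat()` helper that applies the chat template for you; check the version you have installed before relying on it.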
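If you would rather serve the model than load it in-process, vLLM also provides an OpenAI-compatible HTTP server (started with `vllm serve miike-ai/qwen-14b-coder-fp8`, or `python -m vllm.entrypoints.openai.api_server --model miike-ai/qwen-14b-coder-fp8` on older releases). The sketch below queries such a server with the `openai` Python client, assuming it is listening on the default `http://localhost:8000/v1`; the prompt is a placeholder.

```python
# Minimal sketch: query a locally running vLLM OpenAI-compatible server.
# Assumes the server was started for miike-ai/qwen-14b-coder-fp8 on the default port.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # no real key needed locally

response = client.chat.completions.create(
    model="miike-ai/qwen-14b-coder-fp8",
    messages=[{"role": "user", "content": "Write a binary search in Python."}],  # placeholder
    temperature=0.0,
    max_tokens=512,
)
print(response.choices[0].message.content)
```

The server applies the model's chat template on its side, so the client only needs to send role/content messages.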
Model tree for miike-ai/qwen-14b-coder-fp8
- Base model: Qwen/Qwen2.5-14B
- Finetuned: Qwen/Qwen2.5-Coder-14B
- Finetuned: Qwen/Qwen2.5-Coder-14B-Instruct