# Gradio demo Space for the Llama-3-Kafka-8B-v0.1 German chat model.
import gradio as gr
import spaces
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_id = "doubledsbv/Llama-3-Kafka-8B-v0.1"

# Load weights in bfloat16 to roughly halve memory versus fp32; assumes a
# CUDA device with bf16 support — TODO confirm target hardware.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # chat_function slices the echoed prompt off the output
    task='text-generation',
    device="cuda",
)
def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: Latest user message.
        history: Prior (user, assistant) turn pairs supplied by the chatbot widget.
        system_prompt: System instruction prepended to the conversation.
        max_new_tokens: Cap on the number of generated tokens.
        temperature: Sampling temperature from the UI slider.

    Returns:
        The newly generated assistant text, with the echoed prompt stripped.
    """
    messages = [{"role": "system", "content": system_prompt}]
    # Bug fix: `history` was previously ignored, so every turn was generated
    # with no conversational context.  Replay prior turns before the new one.
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        if assistant_turn is not None:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Llama-3 ends assistant turns with <|eot_id|>, not only the plain EOS.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    # temperature == 0 is invalid with do_sample=True; substitute a small
    # floor (assign rather than mutate the slider value in place).
    if temperature == 0:
        temperature = 0.1
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        num_beams=3,
        num_return_sequences=1,
        early_stopping=True,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
    )
    # return_full_text=True, so drop the prompt prefix from the completion.
    return outputs[0]["generated_text"][len(prompt):]
# Build and launch the chat UI.  The additional_inputs map positionally onto
# chat_function's (system_prompt, max_new_tokens, temperature) parameters.
gr.ChatInterface(
    chat_function,
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=5),
    title="Llama-3-Kafka-8B-v0.1",
    description="""
    German-focused finetuned version of Llama-3-8B
    """,
    additional_inputs=[
        gr.Textbox("Du bist ein freundlicher KI-Assistent", label="System Prompt"),
        # Without value=, sliders default to their minimum (512 tokens and
        # temperature 0 — the latter always tripping the sampling workaround).
        gr.Slider(512, 8192, value=1024, label="Max New Tokens"),
        gr.Slider(0, 1, value=0.7, label="Temperature"),
    ],
).launch()