import gradio as gr
from transformers import pipeline, TextIteratorStreamer
import torch
import spaces
from threading import Thread
import os
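
# ZeroGPU Gradio demo for Locutusque's language models: pick a checkpoint from
# the dropdown and stream its text-generation output into the textbox.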
@spaces.GPU
def load_model(model_name):
    # The "token" secret authenticates access to the non-public preview checkpoints.
    return pipeline("text-generation", model=model_name, device_map="cuda",
                    torch_dtype=torch.bfloat16, trust_remote_code=True,
                    token=os.environ["token"])
@spaces.GPU()
def generate(
    model_name,
    user_input,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
    pipe = load_model(model_name)
    # M4-ai/tau-1.8B takes the raw prompt as-is; every other model expects the
    # ChatML template below.
    if model_name == "M4-ai/tau-1.8B":
        prompt = user_input
    else:
        prompt = f"<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"
    streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        text_inputs=prompt, streamer=streamer, max_new_tokens=max_new_tokens,
        do_sample=True, top_p=top_p, min_p=min_p, top_k=top_k,
        temperature=temperature, num_beams=1, repetition_penalty=1.1,
    )
    # Run generation on a background thread so partial output can be yielded
    # as tokens arrive from the streamer.
    t = Thread(target=pipe, kwargs=generation_kwargs)
    t.start()
    outputs = []
    for chunk in streamer:
        outputs.append(chunk)
        yield "".join(outputs)
model_choices = [
    "Locutusque/llama-3-neural-chat-v2.2-8b",
    "Locutusque/Llama-3-Yggdrasil-2.0-8B",
    "Locutusque/Llama-3-NeuralYggdrasil-8B",
    "M4-ai/tau-1.8B",
    "Locutusque/Llama-3-NeuralHercules-5.0-8B",
    "QuasarResearch/Llama-3-OpenCerebrum-2.0-SFT-Optimized",
    "Locutusque/Llama-3-Hercules-5.0-8B",
]
# What are the best default options?
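# Because `generate` is a generator, Gradio streams each yielded string to the
# output textbox as it is produced instead of waiting for the full completion.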
g = gr.Interface(
    fn=generate,
    inputs=[
        gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
        gr.components.Textbox(lines=2, label="Prompt", value="Write me a Python program that calculates the factorial of a given number."),
        gr.components.Slider(minimum=0, maximum=1, value=0.8, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
        gr.components.Slider(minimum=1, maximum=2048, step=1, value=1024, label="Max tokens"),
    ],
    outputs=[gr.Textbox(lines=10, label="Output")],
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
    concurrency_limit=1,
)
g.launch(max_threads=4)