import spaces
import gradio as gr
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
from queue import Empty
import os

@spaces.GPU()
def load_model(model_name):
    # device_map="cuda" places the model on the Space's GPU; bfloat16 halves memory use.
    return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
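
# Note: the Hugging Face access token is read from an environment variable named
# "token" (presumably set as a Space secret); private or preview models will
# fail to load without it.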

@spaces.GPU(duration=45)
def generate(
    message,
    history,
    model_name,
    system,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
    outputs = []
    try:
        pipe = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
        tokenizer.eos_token = "<|im_end|>"  # ChatML end-of-turn token
        print(tokenizer)  # debug: inspect the tokenizer configuration
        pipe.tokenizer = tokenizer
prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
for (user_turn, assistant_turn) in history:
prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
        streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            text_inputs=prompt,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            min_p=min_p,
            top_k=top_k,
            temperature=temperature,
            num_beams=1,
            repetition_penalty=1.1,
        )
        # Run generation in a background thread so tokens can be streamed
        # back to the UI as they arrive.
        t = Thread(target=pipe, kwargs=generation_kwargs)
        t.start()
        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)
    except Empty:
        # TextIteratorStreamer raises queue.Empty when no token arrives
        # within `timeout`; return whatever was generated so far.
        print("Stream stopped unexpectedly.")
        yield "".join(outputs)
    except Exception as e:
        print(f"An error occurred: {e}")
        yield "An error occurred during generation."

model_choices = [
    "Locutusque/Apollo-2.0-Llama-3.1-8B",
    "Locutusque/TinyMistral-248M-v3",
    "Locutusque/Hercules-6.2-Llama-3.1-8B",
    "Locutusque/DareQwen-2.5-7B",
    "M4-ai/TinyMistral-248M-V3-Instruct",
    "Locutusque/StockQwen-2.5-7B",
]
# What are the best options?

g = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
        gr.components.Textbox(lines=2, label="System Prompt", value="You are a sentient, super intelligent AI developed by a man named Locutusque."),
        gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
    ],
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
)

if __name__ == "__main__":
    g.launch()