import gradio as gr
import spaces
import torch
import transformers
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Text-generation pipeline; bfloat16 halves memory versus float32, but
# generation with a 7B model on CPU will be slow.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cpu",
)
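
# A sketch (my assumption, not part of the original Space) of how the same
# pipeline could run on GPU instead of CPU, e.g. on a ZeroGPU Space using the
# same @spaces.GPU decorator the commented-out Llama-3 demo below uses:
#
# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model_name,
#     model_kwargs={"torch_dtype": torch.bfloat16},
#     device_map="auto",  # requires `accelerate`; replaces device="cpu"
# )
#
# @spaces.GPU(duration=120)
# def chat_function(...):  # generation then runs on the allocated GPU
#     ...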
def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    messages = []
    # Rebuild the running conversation from history, if any
    if history:
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})
    # Always add the current user message
    messages.append({"role": "user", "content": message})
    # Mistral's chat template rejects a separate "system" role, so fold the
    # system prompt into the first user turn instead
    if system_prompt:
        messages[0]["content"] = f"{system_prompt}\n\n{messages[0]['content']}"
# Construct the prompt using the pipeline's tokenizer
prompt = pipeline.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
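    # Illustrative only: with the Mistral-Instruct v0.2 chat template the
    # rendered prompt looks roughly like
    #   "<s>[INST] earlier user turn [/INST] earlier reply</s>[INST] message [/INST]"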
    # Generate the response. Mistral only needs its EOS token as a terminator;
    # the "<|eot_id|>" token used by the commented-out Llama-3 code below does
    # not exist in this tokenizer
    terminators = [pipeline.tokenizer.eos_token_id]
    # Keep the temperature strictly positive: do_sample=True with
    # temperature == 0 raises an error, so shift the slider value up by 0.1
    adjusted_temp = temperature + 0.1
    # Generate outputs with the adjusted parameters
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=adjusted_temp,
        top_p=0.9,
    )
# Extract the generated text, skipping the length of the prompt
generated_text = outputs[0]["generated_text"]
return generated_text[len(prompt):] # Return the new part of the conversation
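
# A quick smoke test (illustrative; the history and values below are made up):
#
# reply = chat_function(
#     message="What is the capital of France?",
#     history=[["Hi!", "Hello! How can I help you?"]],
#     system_prompt="You are a helpful AI.",
#     max_new_tokens=64,
#     temperature=0.2,
# )
# print(reply)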
# Gradio interface. gr.Interface is stateless, so the conversation history is
# supplied explicitly as JSON rather than tracked by a Chatbot component.
gr.Interface(
    fn=chat_function,
    inputs=[
        gr.Textbox(placeholder="Enter your message here", label="Your Message"),
        gr.JSON(label="Conversation History (format as [[user, assistant], ...])"),
        gr.Textbox(label="System Prompt"),
        gr.Slider(512, 4096, label="Max New Tokens"),
        gr.Slider(0.0, 1.0, step=0.1, label="Temperature"),
    ],
    outputs=gr.Textbox(label="AI Response"),
).launch()
# def chat_function(message, history, system_prompt,max_new_tokens,temperature):
# messages = [
# {"role": "system", "content": system_prompt},
# {"role": "user", "content": message},
# ]
# prompt = pipeline.tokenizer.apply_chat_template(
# messages,
# tokenize=False,
# add_generation_prompt=True
# )
# terminators = [
# pipeline.tokenizer.eos_token_id,
# pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]
# temp = temperature + 0.1
# outputs = pipeline(
# prompt,
# max_new_tokens=max_new_tokens,
# eos_token_id=terminators,
# do_sample=True,
# temperature=temp,
# top_p=0.9,
# )
# return outputs[0]["generated_text"][len(prompt):]
# gr.ChatInterface(
# chat_function,
# chatbot=gr.Chatbot(height=400),
# textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
# title="Meta-Llama-3-8B-Instruct",
# description="""
# To learn about fine-tuning Llama-3-8B, check https://exnrt.com/blog/ai/finetune-llama3-8b/.
# """,
# additional_inputs=[
# gr.Textbox("You are helpful AI.", label="System Prompt"),
# gr.Slider(512, 4096, label="Max New Tokens"),
# gr.Slider(0, 1, label="Temperature")
# ]
# ).launch()
# The original Llama-3 demo code, kept below for reference:
# import gradio as gr
# import os
# import spaces
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# from threading import Thread
# # Set an environment variable
# HF_TOKEN = os.environ.get("HF_TOKEN", None)
# DESCRIPTION = '''
# <div>
# <h1 style="text-align: center;">Meta Llama3 8B</h1>
# <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
# <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
# <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
# </div>
# '''
# LICENSE = """
# <p/>
# ---
# Built with Meta Llama 3
# """
# PLACEHOLDER = """
# <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
# <img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/8e75e61cc9bab22b7ce3dec85ab0e6db1da5d107/Meta_lockup_positive%20primary_RGB.jpg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
# <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Meta llama3</h1>
# <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
# </div>
# """
# css = """
# h1 {
# text-align: center;
# display: block;
# }
# #duplicate-button {
# margin: auto;
# color: white;
# background: #1565c0;
# border-radius: 100vh;
# }
# """
# # Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto") # to("cuda:0")
# terminators = [
# tokenizer.eos_token_id,
# tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]
# @spaces.GPU(duration=120)
# def chat_llama3_8b(message: str,
# history: list,
# temperature: float,
# max_new_tokens: int
# ) -> str:
# """
# Generate a streaming response using the llama3-8b model.
# Args:
# message (str): The input message.
# history (list): The conversation history used by ChatInterface.
# temperature (float): The temperature for generating the response.
# max_new_tokens (int): The maximum number of new tokens to generate.
# Returns:
# str: The generated response.
# """
# conversation = []
# for user, assistant in history:
# conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
# conversation.append({"role": "user", "content": message})
# input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
# streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
# generate_kwargs = dict(
# input_ids= input_ids,
# streamer=streamer,
# max_new_tokens=max_new_tokens,
# do_sample=True,
# temperature=temperature,
# eos_token_id=terminators,
# )
# # Force greedy generation (do_sample=False) when the temperature is 0, which would otherwise crash sampling.
# if temperature == 0:
# generate_kwargs['do_sample'] = False
# t = Thread(target=model.generate, kwargs=generate_kwargs)
# t.start()
# outputs = []
# for text in streamer:
# outputs.append(text)
# yield "".join(outputs)
# # Gradio block
# chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
# with gr.Blocks(fill_height=True, css=css) as demo:
# gr.Markdown(DESCRIPTION)
# gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
# gr.ChatInterface(
# fn=chat_llama3_8b,
# chatbot=chatbot,
# fill_height=True,
# additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
# additional_inputs=[
# gr.Slider(minimum=0,
# maximum=1,
# step=0.1,
# value=0.95,
# label="Temperature",
# render=False),
# gr.Slider(minimum=128,
# maximum=4096,
# step=1,
# value=512,
# label="Max new tokens",
# render=False ),
# ],
# examples=[
# ['How to setup a human base on Mars? Give short answer.'],
# ['Explain theory of relativity to me like I’m 8 years old.'],
# ['What is 9,000 * 9,000?'],
# ['Write a pun-filled happy birthday message to my friend Alex.']
# ],
# cache_examples=False,
# )
# gr.Markdown(LICENSE)
# if __name__ == "__main__":
# demo.launch()