import gradio as gr
import spaces
import torch
import transformers

# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Text-generation pipeline on CPU; bfloat16 reduces memory use, but CPU inference will be slow
pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cpu",
)

def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    messages = []
    # Rebuild the conversation from the [user, assistant] pairs in history, if any
    if history:
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})

    # Mistral-Instruct's chat template has no separate "system" role, so fold the
    # system prompt (if given) into the current user turn instead of dropping it
    if system_prompt:
        message = f"{system_prompt}\n\n{message}"
    messages.append({"role": "user", "content": message})

    # Build the prompt string with the model's chat template
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Stop generation at the model's end-of-sequence token
    terminators = [pipeline.tokenizer.eos_token_id]

    generate_kwargs = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
    )
    # Greedy decoding when the temperature is 0 avoids the sampling crash
    if temperature == 0:
        generate_kwargs["do_sample"] = False
        generate_kwargs.pop("temperature")
        generate_kwargs.pop("top_p")

    outputs = pipeline(prompt, **generate_kwargs)

    # The pipeline echoes the prompt, so return only the newly generated text
    generated_text = outputs[0]["generated_text"]
    return generated_text[len(prompt):]
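
# A minimal local sanity check for chat_function (hypothetical values; kept
# commented out so running this script still just launches the Gradio app):
# print(chat_function("Hello!", [], "You are a helpful assistant.", 64, 0.7))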

# Gradio interface
gr.Interface(
    fn=chat_function,
    inputs=[
        gr.Textbox(placeholder="Enter your message here", label="Your Message"),
        gr.JSON(label="Conversation History (format as [[user, assistant], ...])"),
        gr.Textbox(label="System Prompt"),
        gr.Slider(512, 4096, label="Max New Tokens"),
        gr.Slider(0.0, 1.0, step=0.1, label="Temperature"),
    ],
    outputs=gr.Textbox(label="AI Response"),
).launch()
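
# Note: the "Conversation History" JSON input above is expected to be a list of
# [user, assistant] string pairs, e.g. (hypothetical example):
# [["Hi there", "Hello! How can I help you today?"],
#  ["What is Gradio?", "Gradio is an open-source Python library for building ML demos."]]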




# def chat_function(message, history, system_prompt,max_new_tokens,temperature):
#     messages = [
#         {"role": "system", "content": system_prompt},
#         {"role": "user", "content": message},
#     ]
#     prompt = pipeline.tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True
#     )
#     terminators = [
#         pipeline.tokenizer.eos_token_id,
#         pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
#     ]
#     temp = temperature + 0.1
#     outputs = pipeline(
#         prompt,
#         max_new_tokens=max_new_tokens,
#         eos_token_id=terminators,
#         do_sample=True,
#         temperature=temp,
#         top_p=0.9,
#     )
#     return outputs[0]["generated_text"][len(prompt):]

# gr.ChatInterface(
#     chat_function,
#     chatbot=gr.Chatbot(height=400),
#     textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
#     title="Meta-Llama-3-8B-Instruct",
#     description="""
#     To learn about fine-tuning Llama-3-8B, check https://exnrt.com/blog/ai/finetune-llama3-8b/.
#     """,
#     additional_inputs=[
#         gr.Textbox("You are helpful AI.", label="System Prompt"),
#         gr.Slider(512, 4096, label="Max New Tokens"),
#         gr.Slider(0, 1, label="Temperature")
#     ]
# ).launch()


#The Code

# import gradio as gr
# import os
# import spaces
# from transformers import GemmaTokenizer, AutoModelForCausalLM
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# from threading import Thread

# # Set an environment variable
# HF_TOKEN = os.environ.get("HF_TOKEN", None)


# DESCRIPTION = '''
# <div>
# <h1 style="text-align: center;">Meta Llama3 8B</h1>
# <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
# <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
# <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
# </div>
# '''

# LICENSE = """
# <p/>
# ---
# Built with Meta Llama 3
# """

# PLACEHOLDER = """
# <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
#    <img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/8e75e61cc9bab22b7ce3dec85ab0e6db1da5d107/Meta_lockup_positive%20primary_RGB.jpg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55;  "> 
#    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Meta llama3</h1>
#    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
# </div>
# """


# css = """
# h1 {
#   text-align: center;
#   display: block;
# }
# #duplicate-button {
#   margin: auto;
#   color: white;
#   background: #1565c0;
#   border-radius: 100vh;
# }
# """

# # Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")  # to("cuda:0") 
# terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

# @spaces.GPU(duration=120)
# def chat_llama3_8b(message: str, 
#               history: list, 
#               temperature: float, 
#               max_new_tokens: int
#              ) -> str:
#     """
#     Generate a streaming response using the llama3-8b model.
#     Args:
#         message (str): The input message.
#         history (list): The conversation history used by ChatInterface.
#         temperature (float): The temperature for generating the response.
#         max_new_tokens (int): The maximum number of new tokens to generate.
#     Returns:
#         str: The generated response.
#     """
#     conversation = []
#     for user, assistant in history:
#         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
#     conversation.append({"role": "user", "content": message})

#     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
    
#     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

#     generate_kwargs = dict(
#         input_ids= input_ids,
#         streamer=streamer,
#         max_new_tokens=max_new_tokens,
#         do_sample=True,
#         temperature=temperature,
#         eos_token_id=terminators,
#     )
#     # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.             
#     if temperature == 0:
#         generate_kwargs['do_sample'] = False
        
#     t = Thread(target=model.generate, kwargs=generate_kwargs)
#     t.start()

#     outputs = []
#     for text in streamer:
#         outputs.append(text)
#         print(outputs)
#         yield "".join(outputs)
        

# # Gradio block
# chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')

# with gr.Blocks(fill_height=True, css=css) as demo:
    
#     gr.Markdown(DESCRIPTION)
#     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
#     gr.ChatInterface(
#         fn=chat_llama3_8b,
#         chatbot=chatbot,
#         fill_height=True,
#         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
#         additional_inputs=[
#             gr.Slider(minimum=0,
#                       maximum=1, 
#                       step=0.1,
#                       value=0.95, 
#                       label="Temperature", 
#                       render=False),
#             gr.Slider(minimum=128, 
#                       maximum=4096,
#                       step=1,
#                       value=512, 
#                       label="Max new tokens", 
#                       render=False ),
#             ],
#         examples=[
#             ['How to setup a human base on Mars? Give short answer.'],
#             ['Explain theory of relativity to me like I’m 8 years old.'],
#             ['What is 9,000 * 9,000?'],
#             ['Write a pun-filled happy birthday message to my friend Alex.']
#             ],
#         cache_examples=False,
#                      )
    
#     gr.Markdown(LICENSE)
    
# if __name__ == "__main__":
#     demo.launch()