akhaliq (HF Staff) committed · commit f3c01e2 · verified · 1 parent: ced599b

Upload app.py with huggingface_hub

Files changed (1): app.py (+430, -0)

app.py ADDED
I'll create a comprehensive chat application using the MobileLLM-Pro model with a modern, interactive interface. This will include conversation history, streaming responses, and a clean UI.

```python
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
from threading import Thread
from typing import List, Dict, Any
import time

# Configuration
MODEL_ID = "facebook/MobileLLM-Pro"
MAX_HISTORY_LENGTH = 10
MAX_NEW_TOKENS = 512
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly, and intelligent assistant. Provide clear, accurate, and thoughtful responses."

# Login to Hugging Face (if a token is provided)
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        print("Successfully logged in to Hugging Face")
    except Exception as e:
        print(f"Warning: Could not login to Hugging Face: {e}")

class MobileLLMChat:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = None
        self.model_loaded = False

    def load_model(self, version="instruct"):
        """Load the MobileLLM-Pro model and tokenizer"""
        try:
            print(f"Loading MobileLLM-Pro ({version})...")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                subfolder=version
            )

            # Load model
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                subfolder=version,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None
            )

            # Set device
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            if not torch.cuda.is_available():
                self.model.to(self.device)

            self.model.eval()
            self.model_loaded = True
            print(f"Model loaded successfully on {self.device}")
            return True

        except Exception as e:
            print(f"Error loading model: {e}")
            return False

    def format_chat_history(self, history: List[Dict[str, str]], system_prompt: str) -> List[Dict[str, str]]:
        """Format chat history for the model"""
        messages = [{"role": "system", "content": system_prompt}]

        for msg in history:
            if msg["role"] in ["user", "assistant"]:
                messages.append(msg)

        return messages

    def generate_response(self, user_input: str, history: List[Dict[str, str]],
                          system_prompt: str, temperature: float = 0.7,
                          max_new_tokens: int = MAX_NEW_TOKENS) -> str:
        """Generate a response from the model"""
        if not self.model_loaded:
            return "Model not loaded. Please try loading the model first."

        try:
            # Add user message to history
            history.append({"role": "user", "content": user_input})

            # Format messages
            messages = self.format_chat_history(history, system_prompt)

            # Apply chat template
            inputs = self.tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                add_generation_prompt=True
            ).to(self.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )

            # Decode only the newly generated tokens (everything after the prompt)
            new_tokens = outputs[0][inputs.shape[-1]:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

            # Add assistant response to history
            history.append({"role": "assistant", "content": response})

            return response

        except Exception as e:
            return f"Error generating response: {str(e)}"

    def generate_stream(self, user_input: str, history: List[Dict[str, str]],
                        system_prompt: str, temperature: float = 0.7):
        """Generate a streaming response from the model"""
        if not self.model_loaded:
            yield "Model not loaded. Please try loading the model first."
            return

        try:
            # Add user message to history
            history.append({"role": "user", "content": user_input})

            # Format messages
            messages = self.format_chat_history(history, system_prompt)

            # Apply chat template
            inputs = self.tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                add_generation_prompt=True
            ).to(self.device)

            # Run generation in a background thread and stream the decoded text
            streamer = TextIteratorStreamer(
                self.tokenizer, skip_prompt=True, skip_special_tokens=True
            )
            generation_kwargs = dict(
                inputs=inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                streamer=streamer,
            )
            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
            thread.start()

            # Yield the growing response as new text arrives
            response = ""
            for new_text in streamer:
                response += new_text
                yield response
            thread.join()

            # Add final response to history
            history.append({"role": "assistant", "content": response})

        except Exception as e:
            yield f"Error generating response: {str(e)}"

# Initialize chat model
chat_model = MobileLLMChat()

def load_model_button(version):
    """Load the model when the button is clicked"""
    success = chat_model.load_model(version)
    if success:
        return gr.update(visible=False), "Model loaded successfully!"
    else:
        return gr.update(visible=True), "Failed to load model. Please check the logs."

def clear_chat():
    """Clear the chat history and the message box"""
    return [], ""

def chat_fn(message, history, system_prompt, temperature, model_version):
    """Non-streaming chat: returns the updated messages-format history"""
    if not chat_model.model_loaded:
        return history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": "Please load the model first using the button above."},
        ]

    # history already uses the messages format ({"role", "content"} dicts)
    working_history = [m for m in history if m["role"] in ("user", "assistant")]

    # Generate response (generate_response appends to working_history internally)
    response = chat_model.generate_response(message, working_history, system_prompt, temperature)

    return history + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": response},
    ]

def chat_stream_fn(message, history, system_prompt, temperature, model_version):
    """Streaming chat: yields the updated history as the response grows"""
    if not chat_model.model_loaded:
        yield history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": "Please load the model first using the button above."},
        ]
        return

    working_history = [m for m in history if m["role"] in ("user", "assistant")]
    base_history = history + [{"role": "user", "content": message}]

    # Stream partial responses into the chat window
    for partial in chat_model.generate_stream(message, working_history, system_prompt, temperature):
        yield base_history + [{"role": "assistant", "content": partial}]

# Create the Gradio interface
with gr.Blocks(
    title="MobileLLM-Pro Chat",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 900px !important;
        margin: auto !important;
    }
    .message {
        padding: 12px !important;
        border-radius: 8px !important;
        margin-bottom: 8px !important;
    }
    .user-message {
        background-color: #e3f2fd !important;
        margin-left: 20% !important;
    }
    .assistant-message {
        background-color: #f5f5f5 !important;
        margin-right: 20% !important;
    }
    """
) as demo:

    # Header
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 20px;">
        <h1>🤖 MobileLLM-Pro Chat</h1>
        <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
        <p>Chat with Facebook's MobileLLM-Pro model optimized for on-device inference</p>
    </div>
    """)

    # Model loading section
    with gr.Row():
        with gr.Column(scale=1):
            model_version = gr.Dropdown(
                choices=["instruct", "base"],
                value="instruct",
                label="Model Version",
                info="Choose between instruct (chat) or base model"
            )
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")

        with gr.Column(scale=2):
            model_status = gr.Textbox(
                label="Model Status",
                value="Model not loaded",
                interactive=False
            )

    # Configuration section
    with gr.Accordion("⚙️ Configuration", open=False):
        with gr.Row():
            system_prompt = gr.Textbox(
                value=DEFAULT_SYSTEM_PROMPT,
                label="System Prompt",
                lines=3,
                info="Customize the AI's behavior and personality"
            )

        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Controls randomness (higher = more creative)"
            )

            streaming = gr.Checkbox(
                value=True,
                label="Enable Streaming",
                info="Show responses as they're being generated"
            )

    # Chat interface
    chatbot = gr.Chatbot(
        label="Chat History",
        height=500,
        show_copy_button=True,
        bubble_full_width=False,
        type="messages"
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here...",
            scale=4,
            container=False
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)
        clear_btn = gr.Button("Clear", scale=0)

    # Event handlers
    load_btn.click(
        load_model_button,
        inputs=[model_version],
        outputs=[load_btn, model_status]
    )

    # Handle chat submission (generator so streaming updates reach the Chatbot)
    def handle_chat(message, history, system_prompt, temperature, model_version, streaming):
        if streaming:
            yield from chat_stream_fn(message, history, system_prompt, temperature, model_version)
        else:
            yield chat_fn(message, history, system_prompt, temperature, model_version)

    msg.submit(
        handle_chat,
        inputs=[msg, chatbot, system_prompt, temperature, model_version, streaming],
        outputs=[chatbot]
    )

    submit_btn.click(
        handle_chat,
        inputs=[msg, chatbot, system_prompt, temperature, model_version, streaming],
        outputs=[chatbot]
    )

    clear_btn.click(
        clear_chat,
        outputs=[chatbot, msg]
    )

    # Examples
    gr.Examples(
        examples=[
            ["What are the benefits of on-device AI models?"],
            ["Explain quantum computing in simple terms."],
            ["Write a short poem about technology."],
            ["What's the difference between machine learning and deep learning?"],
            ["How can I improve my productivity?"],
        ],
        inputs=[msg],
        label="Example Prompts"
    )

    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 20px; color: #666;">
        <p>⚠️ Note: This model requires significant computational resources. Loading may take a few minutes.</p>
        <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
    </div>
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,
        show_error=True,
        debug=True
    )
```

This chat application provides:

## Key Features:
1. **Model Management**: Load either the "instruct" or "base" version of MobileLLM-Pro
2. **Interactive Chat**: Full conversation history with message bubbles
3. **Streaming Responses**: See responses generate in real time (a minimal streaming sketch follows this list)
4. **Customizable Settings**: Adjust the system prompt and temperature
5. **Modern UI**: Clean, responsive interface with example prompts
6. **Error Handling**: Graceful error messages and status updates
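
The streaming path relies on Hugging Face's `TextIteratorStreamer`, which decodes tokens while `generate()` runs on a background thread. Here is a minimal, UI-free sketch of that pattern; it assumes a small causal LM (`gpt2`) purely for illustration, while the app applies the same idea to `facebook/MobileLLM-Pro`:

```python
# Minimal sketch of token streaming with TextIteratorStreamer (illustrative model: gpt2).
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("On-device AI models are useful because", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a background thread while we consume the streamer.
thread = Thread(target=model.generate, kwargs=dict(**inputs, max_new_tokens=50, streamer=streamer))
thread.start()

partial = ""
for new_text in streamer:
    partial += new_text
    print(partial)  # in the app, each partial string is yielded to the Chatbot
thread.join()
```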

## How to Use:
1. Set your `HF_TOKEN` environment variable (if required for the model); see the sketch after this list
2. Select the model version (instruct is recommended for chat)
3. Click "Load Model" and wait for it to finish loading
4. Start chatting with the AI
5. Adjust settings like temperature and the system prompt as needed
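
For steps 1-3, a quick smoke test can also be run from plain Python. This sketch assumes the code above is saved as `app.py` in the working directory and that the placeholder token is replaced with a real one that has access to the model:

```python
# Hypothetical smoke test: set the token, import the app module, then load and query the model.
import os

os.environ["HF_TOKEN"] = "hf_your_token_here"  # placeholder; set before importing app

import app  # builds the UI and creates app.chat_model (model not yet loaded)

if app.chat_model.load_model("instruct"):
    print(app.chat_model.generate_response(
        "Say hello in one sentence.", [], app.DEFAULT_SYSTEM_PROMPT
    ))
else:
    print("Model failed to load; check HF_TOKEN and available hardware.")
```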

## Features:
- **Conversation History**: Maintains context across messages (the message structure is sketched below)
- **Example Prompts**: Quick-start suggestions
- **Clear Function**: Reset the conversation
- **Streaming Toggle**: Choose between instant and streaming responses
- **Status Updates**: Real-time model loading status
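
The conversation context uses the standard chat-messages format expected by `tokenizer.apply_chat_template`; the `Chatbot` component (with `type="messages"`) uses the same role/content structure for the user and assistant turns. A representative history, with illustrative contents, looks like this:

```python
# Illustrative messages-format history; the "role" and "content" keys are what matter.
history = [
    {"role": "system", "content": "You are a helpful, friendly, and intelligent assistant."},
    {"role": "user", "content": "What are the benefits of on-device AI models?"},
    {"role": "assistant", "content": "Lower latency, offline availability, and better privacy."},
    {"role": "user", "content": "Which of those matters most on phones?"},
]
```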

The app handles the model loading process gracefully and provides a professional chat interface for interacting with MobileLLM-Pro.