KingNish committed
Commit fabaa3c
1 Parent(s): b0e6790

Better code management

Files changed (4):
  1. app.py +11 -618
  2. chatbot.py +529 -0
  3. live_chat.py +31 -0
  4. voice_chat.py +86 -0
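
The refactor splits the monolithic app.py into three importable modules. A quick way to sanity-check the split (a hypothetical smoke test, not part of this commit) is to verify that each new module exposes every name the refactored app.py imports from it:

# Hypothetical smoke test (not part of this commit): confirm each new module
# exposes the names that the refactored app.py imports from it.
import importlib

expected = {
    "chatbot": ["chatbot", "model_inference", "BOT_AVATAR", "EXAMPLES",
                "model_selector", "decoding_strategy", "temperature",
                "max_new_tokens", "repetition_penalty", "top_p"],
    "voice_chat": ["respond"],
    "live_chat": ["videochat"],
}
for mod_name, names in expected.items():
    mod = importlib.import_module(mod_name)
    missing = [n for n in names if not hasattr(mod, n)]
    assert not missing, f"{mod_name} is missing {missing}"
print("all names imported by app.py resolve")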
app.py CHANGED
@@ -1,64 +1,9 @@
- import os
- import subprocess
- import random
-
- # Install flash attention, skipping CUDA build if necessary
- subprocess.run(
-     "pip install flash-attn --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True,
- )
- import requests
- from bs4 import BeautifulSoup
- # Import necessary libraries
- import copy
- import spaces
- import time
- import torch
- from threading import Thread
- from typing import List, Dict, Union
- import urllib
- import PIL.Image
- import io
- import datasets
- from streaming_stt_nemo import Model as nemo
  import gradio as gr
- from transformers import TextIteratorStreamer
- from transformers import Idefics2ForConditionalGeneration
- import tempfile
- from huggingface_hub import InferenceClient
- import edge_tts
- import asyncio
- from transformers import pipeline
- from transformers import AutoTokenizer, AutoModelForCausalLM
- from transformers import AutoModel
- from transformers import AutoProcessor

- # Load pre-trained models for image captioning and language modeling
- model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
- processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
-
- # Define a function for image captioning
- @spaces.GPU(queue=False)
- def videochat(image3, prompt3):
-     # Process input image and prompt
-     inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")
-     # Generate captions
-     with torch.inference_mode():
-         output = model3.generate(
-             **inputs,
-             do_sample=False,
-             use_cache=True,
-             max_new_tokens=256,
-             eos_token_id=151645,
-             pad_token_id=processor.tokenizer.pad_token_id
-         )
-     prompt_len = inputs["input_ids"].shape[1]
-     # Decode and return the generated captions
-     decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
-     if decoded_text.endswith("<|im_end|>"):
-         decoded_text = decoded_text[:-10]
-     yield decoded_text
+ # Import modules from other files
+ from chatbot import chatbot, model_inference, BOT_AVATAR, EXAMPLES, model_selector, decoding_strategy, temperature, max_new_tokens, repetition_penalty, top_p
+ from voice_chat import respond
+ from live_chat import videochat

  # Define Gradio theme
  theme = gr.themes.Soft(
@@ -78,551 +23,6 @@ theme = gr.themes.Soft(
      color_accent_soft_dark="transparent"
  )

- # Set default language for speech recognition
- default_lang = "en"
- # Initialize speech recognition engine
- engines = {default_lang: nemo(default_lang)}
-
- # Define a function for speech-to-text transcription
- def transcribe(audio):
-     lang = "en"
-     model = engines[lang]
-     text = model.stt_file(audio)[0]
-     return text
-
- # Get Hugging Face API token
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
- # Define a function to get the appropriate InferenceClient based on model name
- def client_fn(model):
-     if "Nous" in model:
-         return InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
-     elif "Star" in model:
-         return InferenceClient("HuggingFaceH4/starchat2-15b-v0.1")
-     elif "Mistral" in model:
-         return InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
-     elif "Phi" in model:
-         return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
-     elif "Zephyr" in model:
-         return InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-     else:
-         return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-
- # Define a function to generate a random seed
- def randomize_seed_fn(seed: int) -> int:
-     seed = random.randint(0, 999999)
-     return seed
-
- # System instructions for the language model
- system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
-
- # Define a function for language modeling
- def models(text, model="Mixtral 8x7B", seed=42):
-     seed = int(randomize_seed_fn(seed))
-     generator = torch.Generator().manual_seed(seed)
-     client = client_fn(model)
-     generate_kwargs = dict(
-         max_new_tokens=512,
-         seed=seed,
-     )
-     formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
-     stream = client.text_generation(
-         formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
-     )
-     output = ""
-     for response in stream:
-         if not response.token.text == "</s>":
-             output += response.token.text
-     return output
-
- # Define an asynchronous function to handle voice input and generate responses
- async def respond(audio, model, seed):
-     user = transcribe(audio)
-     reply = models(user, model, seed)
-     communicate = edge_tts.Communicate(reply)
-     # Save the generated speech to a temporary file
-     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-         tmp_path = tmp_file.name
-         await communicate.save(tmp_path)
-     yield tmp_path
-
- # Set device to CUDA if available, otherwise CPU
- DEVICE = torch.device("cuda")
- # Load pre-trained models for image-based chat
- MODELS = {
-     "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
-         "HuggingFaceM4/idefics2-8b-chatty",
-         torch_dtype=torch.float16,
-         _attn_implementation="flash_attention_2",
-     ).to(DEVICE),
- }
- # Load pre-trained processor for image-based chat
- PROCESSOR = AutoProcessor.from_pretrained(
-     "HuggingFaceM4/idefics2-8b",
- )
-
- # Define system prompt for the image-based chat model
- SYSTEM_PROMPT = [
-     {
-         "role": "system",
-         "content": [
-             {
-                 "type": "text",
-                 "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include: - **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information. - **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals: > ![](https://image.pollinations.ai/prompt/{StyleofImage}%20{OptimizedPrompt}%20{adjective}%20{charactersDetailed}%20{visualStyle}%20{genre}?width={width}&height={height}&nologo=poll&nofeed=yes&seed={random})For image generation, I replace {info inside curly braces} with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. For instance, if the User requests: [USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars. [OpenGPT 4o] Generating Image you requested: ![](https://image.pollinations.ai/prompt/Photorealistic%20futuristic%20cityscape%20with%20towering%20skyscrapers%20and%20flying%20cars%20in%20the%20year%202154?width=1024&height=768&nologo=poll&nofeed=yes&seed=85172)**Bulk Image Generation with Links:** I excel at generating multiple images link simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.Note: Make sure to always provide image links starting with ! .As given in examples. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question."""
-             },
-         ],
-     },
-     {
-         "role": "assistant",
-         "content": [
-             {
-                 "type": "text",
-                 "text": "Hello, I'm OpenGPT 4o, made by KingNish. How can I help you? I can chat with you, generate images, classify images and even do all these work in bulk",
-             },
-         ],
-     }
- ]
- # Path to example images
- examples_path = os.path.dirname(__file__)
- EXAMPLES = [
-     [
-         {
-             "text": "Hi, who are you?",
-         }
-     ],
-     [
-         {
-             "text": "Create a Photorealistic image of the Eiffel Tower.",
-         }
-     ],
-     [
-         {
-             "text": "Read what's written on the paper.",
-             "files": [f"{examples_path}/example_images/paper_with_text.png"],
-         }
-     ],
-     [
-         {
-             "text": "Identify two famous people in the modern world.",
-             "files": [f"{examples_path}/example_images/elon_smoking.jpg",
-                       f"{examples_path}/example_images/steve_jobs.jpg", ]
-         }
-     ],
-     [
-         {
-             "text": "Create five images of supercars, each in a different color.",
-         }
-     ],
-     [
-         {
-             "text": "What is 900 multiplied by 900?",
-         }
-     ],
-     [
-         {
-             "text": "Chase wants to buy 4 kilograms of oval beads and 5 kilograms of star-shaped beads. How much will he spend?",
-             "files": [f"{examples_path}/example_images/mmmu_example.jpeg"],
-         }
-     ],
-     [
-         {
-             "text": "Create an online ad for this product.",
-             "files": [f"{examples_path}/example_images/shampoo.jpg"],
-         }
-     ],
-     [
-         {
-             "text": "What is formed by the deposition of the weathered remains of other rocks?",
-             "files": [f"{examples_path}/example_images/ai2d_example.jpeg"],
-         }
-     ],
-     [
-         {
-             "text": "What's unusual about this image?",
-             "files": [f"{examples_path}/example_images/dragons_playing.png"],
-         }
-     ],
- ]
-
- # Set bot avatar image
- BOT_AVATAR = "OpenAI_logo.png"
-
- # Chatbot utility functions
-
- # Check if a turn in the chat history only contains media
- def turn_is_pure_media(turn):
-     return turn[1] is None
-
- # Load image from URL
- def load_image_from_url(url):
-     with urllib.request.urlopen(url) as response:
-         image_data = response.read()
-         image_stream = io.BytesIO(image_data)
-         image = PIL.Image.open(image_stream)
-         return image
-
- # Convert image to bytes
- def img_to_bytes(image_path):
-     image = PIL.Image.open(image_path).convert(mode='RGB')
-     buffer = io.BytesIO()
-     image.save(buffer, format="JPEG")
-     img_bytes = buffer.getvalue()
-     image.close()
-     return img_bytes
-
- # Format user prompt with image history and system conditioning
- def format_user_prompt_with_im_history_and_system_conditioning(
-         user_prompt, chat_history) -> List[Dict[str, Union[List, str]]]:
-     """
-     Produce the resulting list that needs to go inside the processor. It handles the potential image(s), the history, and the system conditioning.
-     """
-     resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
-     resulting_images = []
-     for resulting_message in resulting_messages:
-         if resulting_message["role"] == "user":
-             for content in resulting_message["content"]:
-                 if content["type"] == "image":
-                     resulting_images.append(load_image_from_url(content["image"]))
-     # Format history
-     for turn in chat_history:
-         if not resulting_messages or (
-             resulting_messages and resulting_messages[-1]["role"] != "user"
-         ):
-             resulting_messages.append(
-                 {
-                     "role": "user",
-                     "content": [],
-                 }
-             )
-         if turn_is_pure_media(turn):
-             media = turn[0][0]
-             resulting_messages[-1]["content"].append({"type": "image"})
-             resulting_images.append(PIL.Image.open(media))
-         else:
-             user_utterance, assistant_utterance = turn
-             resulting_messages[-1]["content"].append(
-                 {"type": "text", "text": user_utterance.strip()}
-             )
-             resulting_messages.append(
-                 {
-                     "role": "assistant",
-                     "content": [{"type": "text", "text": user_utterance.strip()}],
-                 }
-             )
-     # Format current input
-     if not user_prompt["files"]:
-         resulting_messages.append(
-             {
-                 "role": "user",
-                 "content": [{"type": "text", "text": user_prompt["text"]}],
-             }
-         )
-     else:
-         # Choosing to put the image first (i.e. before the text), but this is an arbitrary choice.
-         resulting_messages.append(
-             {
-                 "role": "user",
-                 "content": [{"type": "image"}] * len(user_prompt["files"])
-                 + [{"type": "text", "text": user_prompt["text"]}],
-             }
-         )
-         resulting_images.extend([PIL.Image.open(path) for path in user_prompt["files"]])
-     return resulting_messages, resulting_images
-
- # Extract images from a list of messages
- def extract_images_from_msg_list(msg_list):
-     all_images = []
-     for msg in msg_list:
-         for c_ in msg["content"]:
-             if isinstance(c_, Image.Image):
-                 all_images.append(c_)
-     return all_images
-
- # List of user agents for web search
- _useragent_list = [
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
- ]
-
- # Get a random user agent from the list
- def get_useragent():
-     """Returns a random user agent from the list."""
-     return random.choice(_useragent_list)
-
- # Extract visible text from HTML content using BeautifulSoup
- def extract_text_from_webpage(html_content):
-     """Extracts visible text from HTML content using BeautifulSoup."""
-     soup = BeautifulSoup(html_content, "html.parser")
-     # Remove unwanted tags
-     for tag in soup(["script", "style", "header", "footer", "nav"]):
-         tag.extract()
-     # Get the remaining visible text
-     visible_text = soup.get_text(strip=True)
-     return visible_text
-
- # Perform a Google search and return the results
- def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
-     """Performs a Google search and returns the results."""
-     escaped_term = urllib.parse.quote_plus(term)
-     start = 0
-     all_results = []
-     # Limit the number of characters from each webpage to stay under the token limit
-     max_chars_per_page = 10000  # Adjust this value based on your token limit and average webpage length
-
-     with requests.Session() as session:
-         while start < num_results:
-             resp = session.get(
-                 url="https://www.google.com/search",
-                 headers={"User-Agent": get_useragent()},
-                 params={
-                     "q": term,
-                     "num": num_results - start,
-                     "hl": lang,
-                     "start": start,
-                     "safe": safe,
-                 },
-                 timeout=timeout,
-                 verify=ssl_verify,
-             )
-             resp.raise_for_status()
-             soup = BeautifulSoup(resp.text, "html.parser")
-             result_block = soup.find_all("div", attrs={"class": "g"})
-             if not result_block:
-                 start += 1
-                 continue
-             for result in result_block:
-                 link = result.find("a", href=True)
-                 if link:
-                     link = link["href"]
-                     try:
-                         webpage = session.get(link, headers={"User-Agent": get_useragent()})
-                         webpage.raise_for_status()
-                         visible_text = extract_text_from_webpage(webpage.text)
-                         # Truncate text if it's too long
-                         if len(visible_text) > max_chars_per_page:
-                             visible_text = visible_text[:max_chars_per_page] + "..."
-                         all_results.append({"link": link, "text": visible_text})
-                     except requests.exceptions.RequestException as e:
-                         print(f"Error fetching or processing {link}: {e}")
-                         all_results.append({"link": link, "text": None})
-                 else:
-                     all_results.append({"link": None, "text": None})
-             start += len(result_block)
-     return all_results
-
- # Format the prompt for the language model
- def format_prompt(user_prompt, chat_history):
-     prompt = "<s>"
-     for item in chat_history:
-         # Check if the item is a tuple (text response)
-         if isinstance(item, tuple):
-             prompt += f"[INST] {item[0]} [/INST]"  # User prompt
-             prompt += f" {item[1]}</s> "  # Bot response
-         # Otherwise, assume it's related to an image - you might need to adjust this logic
-         else:
-             # Handle image representation in the prompt, e.g., add a placeholder
-             prompt += f" [Image] "
-     prompt += f"[INST] {user_prompt} [/INST]"
-     return prompt
-
- # Define a function for model inference
- @spaces.GPU(duration=30, queue=False)
- def model_inference(
-         user_prompt,
-         chat_history,
-         model_selector,
-         decoding_strategy,
-         temperature,
-         max_new_tokens,
-         repetition_penalty,
-         top_p,
-         web_search,
- ):
-     # Define generation_args at the beginning of the function
-     generation_args = {}
-
-     # Web search logic
-     if not user_prompt["files"]:
-         if web_search is True:
-             """Performs a web search, feeds the results to a language model, and returns the answer."""
-             web_results = search(user_prompt["text"])
-             web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
-             # Load the language model
-             client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
-             generate_kwargs = dict(
-                 max_new_tokens=4000,
-                 do_sample=True,
-             )
-             # Format the prompt for the language model
-             formatted_prompt = format_prompt(
-                 f"""You are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, You are provided with WEB info from which you can find informations to answer. You do not say Unnecesarry things Only say thing which is important and relevant. You also has the ability to generate images. BY Utilizing the following link structure, : ![](https://image.pollinations.ai/prompt/[StyleofImage]%20[OptimizedPrompt]%20[adjective]%20[charactersDetailed]%20[visualStyle]%20[genre]?width=[width]&height=[height]&nologo=poll&nofeed=yes&seed=[random]) For image generation, You replace info inside bracets with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. Note: 1. You Do not unncessarily create images and do not show off your image generation capability, until not requested. 2. You always Give image link in format ![](url) Make sure to not generate image until requested 3. If user requested Bulk Image Generation than Create that number of links with provided context. 4. Always learn from previous conversation. 5. Always try to connect conversation with history. 6. Do not make conversation too long. 7. Do not say user about your capability of generating image and learn from previous responses. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations.You remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question. Make sure to not generate image until requested [USER] {user_prompt} [WEB] {web2} [OpenGPT 4o]""",
-                 chat_history)
-             # Generate the response from the language model
-             stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
-                                             return_full_text=False)
-             output = ""
-             # Construct the output from the stream of tokens
-             for response in stream:
-                 if not response.token.text == "</s>":
-                     output += response.token.text
-                     yield output
-         else:
-             client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-             generate_kwargs = dict(
-                 max_new_tokens=5000,
-                 do_sample=True,
-             )
-             # Format the prompt for the language model
-             formatted_prompt = format_prompt(
-                 f"""You are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, You do not say Unnecesarry things Only say thing which is important and relevant. You also has the ability to generate images. BY Utilizing the following link structure, : ![](https://image.pollinations.ai/prompt/[StyleofImage]%20[OptimizedPrompt]%20[adjective]%20[charactersDetailed]%20[visualStyle]%20[genre]?width=[width]&height=[height]&nologo=poll&nofeed=yes&seed=[random]) For image generation, You replace info inside bracets with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. Note: 1. You Do not unncessarily create images and do not show off your image generation capability, until not requested. 2. You always Give image link in format ![](url) 3. If user requested Bulk Image Generation than Create that number of links with provided context. 4. Always learn from previous conversation. 5. Always try to connect conversation with history. 6. Do not make conversation too long. 7. Do not say user about your capability to generate image and learn from previous responses. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question. [USER] {user_prompt} [OpenGPT 4o]""",
-                 chat_history)
-             # Generate the response from the language model
-             stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
-                                             return_full_text=False)
-             output = ""
-             # Construct the output from the stream of tokens
-             for response in stream:
-                 if not response.token.text == "</s>":
-                     output += response.token.text
-                     yield output
-             return
-     else:
-         if user_prompt["text"].strip() == "" and not user_prompt["files"]:
-             gr.Error("Please input a query and optionally an image(s).")
-             return  # Stop execution if there's an error
-
-         if user_prompt["text"].strip() == "" and user_prompt["files"]:
-             gr.Error("Please input a text query along with the image(s).")
-             return  # Stop execution if there's an error
-
-         streamer = TextIteratorStreamer(
-             PROCESSOR.tokenizer,
-             skip_prompt=True,
-             timeout=120.0,
-         )
-         # Move generation_args initialization here
-         generation_args = {
-             "max_new_tokens": max_new_tokens,
-             "repetition_penalty": repetition_penalty,
-             "streamer": streamer,
-         }
-         assert decoding_strategy in [
-             "Greedy",
-             "Top P Sampling",
-         ]
-
-         if decoding_strategy == "Greedy":
-             generation_args["do_sample"] = False
-         elif decoding_strategy == "Top P Sampling":
-             generation_args["temperature"] = temperature
-             generation_args["do_sample"] = True
-             generation_args["top_p"] = top_p
-         # Creating model inputs
-         (
-             resulting_text,
-             resulting_images,
-         ) = format_user_prompt_with_im_history_and_system_conditioning(
-             user_prompt=user_prompt,
-             chat_history=chat_history,
-         )
-         prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
-         inputs = PROCESSOR(
-             text=prompt,
-             images=resulting_images if resulting_images else None,
-             return_tensors="pt",
-         )
-         inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-         generation_args.update(inputs)
-         thread = Thread(
-             target=MODELS[model_selector].generate,
-             kwargs=generation_args,
-         )
-         thread.start()
-         acc_text = ""
-         for text_token in streamer:
-             time.sleep(0.01)
-             acc_text += text_token
-             if acc_text.endswith("<end_of_utterance>"):
-                 acc_text = acc_text[:-18]
-             yield acc_text
-         return
- # Define features for the dataset
- FEATURES = datasets.Features(
-     {
-         "model_selector": datasets.Value("string"),
-         "images": datasets.Sequence(datasets.Image(decode=True)),
-         "conversation": datasets.Sequence({"User": datasets.Value("string"), "Assistant": datasets.Value("string")}),
-         "decoding_strategy": datasets.Value("string"),
-         "temperature": datasets.Value("float32"),
-         "max_new_tokens": datasets.Value("int32"),
-         "repetition_penalty": datasets.Value("float32"),
-         "top_p": datasets.Value("int32"),
-     }
- )
-
- # Define hyper-parameters for generation
- max_new_tokens = gr.Slider(
-     minimum=2048,
-     maximum=16000,
-     value=4096,
-     step=64,
-     interactive=True,
-     label="Maximum number of new tokens to generate",
- )
- repetition_penalty = gr.Slider(
-     minimum=0.01,
-     maximum=5.0,
-     value=1,
-     step=0.01,
-     interactive=True,
-     label="Repetition penalty",
-     info="1.0 is equivalent to no penalty",
- )
- decoding_strategy = gr.Radio(
-     [
-         "Greedy",
-         "Top P Sampling",
-     ],
-     value="Top P Sampling",
-     label="Decoding strategy",
-     interactive=True,
-     info="Higher values are equivalent to sampling more low-probability tokens.",
- )
- temperature = gr.Slider(
-     minimum=0.0,
-     maximum=2.0,
-     value=0.5,
-     step=0.05,
-     visible=True,
-     interactive=True,
-     label="Sampling temperature",
-     info="Higher values will produce more diverse outputs.",
- )
- top_p = gr.Slider(
-     minimum=0.01,
-     maximum=0.99,
-     value=0.9,
-     step=0.01,
-     visible=True,
-     interactive=True,
-     label="Top P",
-     info="Higher values are equivalent to sampling more low-probability tokens.",
- )
-
- # Create a chatbot interface
- chatbot = gr.Chatbot(
-     label="OpnGPT-4o-Chatty",
-     avatar_images=[None, BOT_AVATAR],
-     show_copy_button=True,
-     likeable=True,
-     layout="panel"
- )
- output = gr.Textbox(label="Prompt")

  # Create Gradio blocks for different functionalities

@@ -633,15 +33,9 @@ with gr.Blocks(
  ) as chat:
      gr.Markdown("# Image Chat, Image Generation, Image classification and Normal Chat")
      with gr.Row(elem_id="model_selector_row"):
-         model_selector = gr.Dropdown(
-             choices=MODELS.keys(),
-             value=list(MODELS.keys())[0],
-             interactive=True,
-             show_label=False,
-             container=False,
-             label="Model",
-             visible=False,
-         )
+         # model_selector defined in chatbot.py
+         pass
+     # decoding_strategy, temperature, top_p defined in chatbot.py
      decoding_strategy.change(
          fn=lambda selection: gr.Slider(
              visible=(
@@ -675,7 +69,7 @@ with gr.Blocks(
              max_new_tokens,
              repetition_penalty,
              top_p,
-             gr.Checkbox(label="Web Search", value=True),  # Add web_search checkbox
+             gr.Checkbox(label="Web Search", value=True),
          ],
      )

@@ -703,13 +97,14 @@ with gr.Blocks() as voice:
          outputs=[output], api_name="translate", live=True)

  # Live chat block
- with gr.Blocks() as livechat:
+ with gr.Blocks() as livechat:
      gr.Interface(
          fn=videochat,
          inputs=[gr.Image(type="pil", sources="webcam", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
          outputs=gr.Textbox(label="Answer")
      )

+ # Other blocks (instant, dalle, playground, image, instant2, video)
  with gr.Blocks() as instant:
      gr.HTML("<iframe src='https://kingnish-sdxl-flash.hf.space' width='100%' height='2000px' style='border-radius: 8px;'></iframe>")

@@ -723,9 +118,6 @@ with gr.Blocks() as image:
      gr.Markdown("""### More models are coming""")
      gr.TabbedInterface([ instant, dalle, playground], ['Instant🖼️','Powerful🖼️', 'Playground🖼'])

-
-
-
  with gr.Blocks() as instant2:
      gr.HTML("<iframe src='https://kingnish-instant-video.hf.space' width='100%' height='3000px' style='border-radius: 8px;'></iframe>")

@@ -733,6 +125,7 @@ with gr.Blocks() as video:
      gr.Markdown("""More Models are coming""")
      gr.TabbedInterface([ instant2], ['Instant🎥'])

+ # Main application block
  with gr.Blocks(theme=theme, title="OpenGPT 4o DEMO") as demo:
      gr.Markdown("# OpenGPT 4o")
      gr.TabbedInterface([chat, voice, livechat, image, video], ['💬 SuperChat','🗣️ Voice Chat','📸 Live Chat', '🖼️ Image Engine', '🎥 Video Engine'])
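
Note on the sharing pattern used here: model_selector and the generation sliders are now instantiated at import time in chatbot.py and merely referenced inside app.py's Blocks (hence the `pass` in the model selector row). In Gradio, a component created outside a Blocks context stays unrendered until it is placed in a layout; if a visible slot is wanted for it, the usual idiom is component.render(). A minimal sketch of that idiom, under the assumption of a recent Gradio 4.x:

# Sketch (assumption: Gradio 4.x behavior for unrendered components).
import gradio as gr

# Component built at module import time (e.g. in chatbot.py), outside any Blocks.
temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.5, label="Sampling temperature")

with gr.Blocks() as demo:
    temperature.render()  # attach the pre-built component to this layout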
chatbot.py ADDED
@@ -0,0 +1,529 @@
+ import os
+ import io  # needed by load_image_from_url and img_to_bytes below; missing from the original commit
+ import time
+ import copy
+ import urllib
+ import requests
+ import random
+ from threading import Thread
+ from typing import List, Dict, Union
+ import subprocess
+ # Install flash attention, skipping CUDA build if necessary
+ subprocess.run(
+     "pip install flash-attn --no-build-isolation",
+     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     shell=True,
+ )
+ import torch
+ import gradio as gr
+ from bs4 import BeautifulSoup
+ import datasets
+ from transformers import TextIteratorStreamer
+ from transformers import Idefics2ForConditionalGeneration
+ from transformers import AutoProcessor
+ from huggingface_hub import InferenceClient
+ from PIL import Image
+ import spaces
+
+ # Set device to CUDA if available, otherwise CPU
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # Load pre-trained models for image-based chat
+ MODELS = {
+     "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
+         "HuggingFaceM4/idefics2-8b-chatty",
+         torch_dtype=torch.float16,
+         _attn_implementation="flash_attention_2",
+     ).to(DEVICE),
+ }
+
+ # Load pre-trained processor for image-based chat
+ PROCESSOR = AutoProcessor.from_pretrained(
+     "HuggingFaceM4/idefics2-8b",
+ )
+
+ # Define system prompt for the image-based chat model
+ SYSTEM_PROMPT = [
+     {
+         "role": "system",
+         "content": [
+             {
+                 "type": "text",
+                 "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include: - **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information. - **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals: > ![](https://image.pollinations.ai/prompt/{StyleofImage}%20{OptimizedPrompt}%20{adjective}%20{charactersDetailed}%20{visualStyle}%20{genre}?width={width}&height={height}&nologo=poll&nofeed=yes&seed={random}) For image generation, I replace {info inside curly braces} with specific details according to the requirements to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. For instance, if the User requests: [USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars. [OpenGPT 4o] Generating Image you requested: ![](https://image.pollinations.ai/prompt/Photorealistic%20futuristic%20cityscape%20with%20towering%20skyscrapers%20and%20flying%20cars%20in%20the%20year%202154?width=1024&height=768&nologo=poll&nofeed=yes&seed=85172) **Bulk Image Generation with Links:** I excel at generating multiple image links simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User. Note: Make sure to always provide image links starting with !, as given in the examples. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. I am also an expert in every field and try to answer using context from previous questions."""
+             },
+         ],
+     },
+     {
+         "role": "assistant",
+         "content": [
+             {
+                 "type": "text",
+                 "text": "Hello, I'm OpenGPT 4o, made by KingNish. How can I help you? I can chat with you, generate images, classify images, and even do all of this in bulk.",
+             },
+         ],
+     }
+ ]
+
+ # Path to example images
+ examples_path = os.path.dirname(__file__)
+ EXAMPLES = [
+     [
+         {
+             "text": "Hi, who are you?",
+         }
+     ],
+     [
+         {
+             "text": "Create a Photorealistic image of the Eiffel Tower.",
+         }
+     ],
+     [
+         {
+             "text": "Read what's written on the paper.",
+             "files": [f"{examples_path}/example_images/paper_with_text.png"],
+         }
+     ],
+     [
+         {
+             "text": "Identify two famous people in the modern world.",
+             "files": [f"{examples_path}/example_images/elon_smoking.jpg",
+                       f"{examples_path}/example_images/steve_jobs.jpg", ]
+         }
+     ],
+     [
+         {
+             "text": "Create five images of supercars, each in a different color.",
+         }
+     ],
+     [
+         {
+             "text": "What is 900 multiplied by 900?",
+         }
+     ],
+     [
+         {
+             "text": "Chase wants to buy 4 kilograms of oval beads and 5 kilograms of star-shaped beads. How much will he spend?",
+             "files": [f"{examples_path}/example_images/mmmu_example.jpeg"],
+         }
+     ],
+     [
+         {
+             "text": "Create an online ad for this product.",
+             "files": [f"{examples_path}/example_images/shampoo.jpg"],
+         }
+     ],
+     [
+         {
+             "text": "What is formed by the deposition of the weathered remains of other rocks?",
+             "files": [f"{examples_path}/example_images/ai2d_example.jpeg"],
+         }
+     ],
+     [
+         {
+             "text": "What's unusual about this image?",
+             "files": [f"{examples_path}/example_images/dragons_playing.png"],
+         }
+     ],
+ ]
+
+ # Set bot avatar image
+ BOT_AVATAR = "OpenAI_logo.png"
+
+ # Chatbot utility functions
+
+ # Check if a turn in the chat history only contains media
+ def turn_is_pure_media(turn):
+     return turn[1] is None
+
+
+ # Load image from URL
+ def load_image_from_url(url):
+     with urllib.request.urlopen(url) as response:
+         image_data = response.read()
+         image_stream = io.BytesIO(image_data)
+         image = Image.open(image_stream)  # fixed: was PIL.Image.open, but only `from PIL import Image` is in scope
+         return image
+
+
+ # Convert image to bytes
+ def img_to_bytes(image_path):
+     image = Image.open(image_path).convert(mode='RGB')
+     buffer = io.BytesIO()
+     image.save(buffer, format="JPEG")
+     img_bytes = buffer.getvalue()
+     image.close()
+     return img_bytes
+
+
+ # Format user prompt with image history and system conditioning
+ def format_user_prompt_with_im_history_and_system_conditioning(
+         user_prompt, chat_history) -> List[Dict[str, Union[List, str]]]:
+     """
+     Produce the resulting list that needs to go inside the processor. It handles the potential image(s), the history, and the system conditioning.
+     """
+     resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
+     resulting_images = []
+     for resulting_message in resulting_messages:
+         if resulting_message["role"] == "user":
+             for content in resulting_message["content"]:
+                 if content["type"] == "image":
+                     resulting_images.append(load_image_from_url(content["image"]))
+     # Format history
+     for turn in chat_history:
+         if not resulting_messages or (
+             resulting_messages and resulting_messages[-1]["role"] != "user"
+         ):
+             resulting_messages.append(
+                 {
+                     "role": "user",
+                     "content": [],
+                 }
+             )
+         if turn_is_pure_media(turn):
+             media = turn[0][0]
+             resulting_messages[-1]["content"].append({"type": "image"})
+             resulting_images.append(Image.open(media))
+         else:
+             user_utterance, assistant_utterance = turn
+             resulting_messages[-1]["content"].append(
+                 {"type": "text", "text": user_utterance.strip()}
+             )
+             resulting_messages.append(
+                 {
+                     "role": "assistant",
+                     "content": [{"type": "text", "text": assistant_utterance.strip()}],  # fixed: was user_utterance
+                 }
+             )
+     # Format current input
+     if not user_prompt["files"]:
+         resulting_messages.append(
+             {
+                 "role": "user",
+                 "content": [{"type": "text", "text": user_prompt["text"]}],
+             }
+         )
+     else:
+         # Choosing to put the image first (i.e. before the text), but this is an arbitrary choice.
+         resulting_messages.append(
+             {
+                 "role": "user",
+                 "content": [{"type": "image"}] * len(user_prompt["files"])
+                 + [{"type": "text", "text": user_prompt["text"]}],
+             }
+         )
+         resulting_images.extend([Image.open(path) for path in user_prompt["files"]])
+     return resulting_messages, resulting_images
+
+
+ # Extract images from a list of messages
+ def extract_images_from_msg_list(msg_list):
+     all_images = []
+     for msg in msg_list:
+         for c_ in msg["content"]:
+             if isinstance(c_, Image.Image):
+                 all_images.append(c_)
+     return all_images
+
+
+ # List of user agents for web search
+ _useragent_list = [
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
+ ]
+
+
+ # Get a random user agent from the list
+ def get_useragent():
+     """Returns a random user agent from the list."""
+     return random.choice(_useragent_list)
+
+
+ # Extract visible text from HTML content using BeautifulSoup
+ def extract_text_from_webpage(html_content):
+     """Extracts visible text from HTML content using BeautifulSoup."""
+     soup = BeautifulSoup(html_content, "html.parser")
+     # Remove unwanted tags
+     for tag in soup(["script", "style", "header", "footer", "nav"]):
+         tag.extract()
+     # Get the remaining visible text
+     visible_text = soup.get_text(strip=True)
+     return visible_text
+
+
+ # Perform a Google search and return the results
+ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
+     """Performs a Google search and returns the results."""
+     escaped_term = urllib.parse.quote_plus(term)
+     start = 0
+     all_results = []
+     # Limit the number of characters from each webpage to stay under the token limit
+     max_chars_per_page = 10000  # Adjust this value based on your token limit and average webpage length
+
+     with requests.Session() as session:
+         while start < num_results:
+             resp = session.get(
+                 url="https://www.google.com/search",
+                 headers={"User-Agent": get_useragent()},
+                 params={
+                     "q": term,
+                     "num": num_results - start,
+                     "hl": lang,
+                     "start": start,
+                     "safe": safe,
+                 },
+                 timeout=timeout,
+                 verify=ssl_verify,
+             )
+             resp.raise_for_status()
+             soup = BeautifulSoup(resp.text, "html.parser")
+             result_block = soup.find_all("div", attrs={"class": "g"})
+             if not result_block:
+                 start += 1
+                 continue
+             for result in result_block:
+                 link = result.find("a", href=True)
+                 if link:
+                     link = link["href"]
+                     try:
+                         webpage = session.get(link, headers={"User-Agent": get_useragent()})
+                         webpage.raise_for_status()
+                         visible_text = extract_text_from_webpage(webpage.text)
+                         # Truncate text if it's too long
+                         if len(visible_text) > max_chars_per_page:
+                             visible_text = visible_text[:max_chars_per_page] + "..."
+                         all_results.append({"link": link, "text": visible_text})
+                     except requests.exceptions.RequestException as e:
+                         print(f"Error fetching or processing {link}: {e}")
+                         all_results.append({"link": link, "text": None})
+                 else:
+                     all_results.append({"link": None, "text": None})
+             start += len(result_block)
+     return all_results
+
+
+ # Format the prompt for the language model
+ def format_prompt(user_prompt, chat_history):
+     prompt = "<s>"
+     for item in chat_history:
+         # Check if the item is a tuple (text response)
+         if isinstance(item, tuple):
+             prompt += f"[INST] {item[0]} [/INST]"  # User prompt
+             prompt += f" {item[1]}</s> "  # Bot response
+         # Otherwise, assume it's related to an image - you might need to adjust this logic
+         else:
+             # Handle image representation in the prompt, e.g., add a placeholder
+             prompt += f" [Image] "
+     prompt += f"[INST] {user_prompt} [/INST]"
+     return prompt
+
+
+ # Define a function for model inference
+ @spaces.GPU(duration=30, queue=False)
+ def model_inference(
+         user_prompt,
+         chat_history,
+         model_selector,
+         decoding_strategy,
+         temperature,
+         max_new_tokens,
+         repetition_penalty,
+         top_p,
+         web_search,
+ ):
+     # Define generation_args at the beginning of the function
+     generation_args = {}
+
+     # Web search logic
+     if not user_prompt["files"]:
+         if web_search is True:
+             # Perform a web search, feed the results to a language model, and return the answer
+             web_results = search(user_prompt["text"])
+             web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
+             # Load the language model
+             client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
+             generate_kwargs = dict(
+                 max_new_tokens=4000,
+                 do_sample=True,
+             )
+             # Format the prompt for the language model
+             formatted_prompt = format_prompt(
+                 f"""You are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish, designed to assist human users through insightful conversations. You are provided with WEB info from which you can find the information needed to answer. You do not say unnecessary things; you only say what is important and relevant. You also have the ability to generate images by utilizing the following link structure: ![](https://image.pollinations.ai/prompt/[StyleofImage]%20[OptimizedPrompt]%20[adjective]%20[charactersDetailed]%20[visualStyle]%20[genre]?width=[width]&height=[height]&nologo=poll&nofeed=yes&seed=[random]) For image generation, you replace the info inside brackets with specific details according to the requirements to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. Note: 1. Do not unnecessarily create images and do not show off your image generation capability unless requested. 2. Always give image links in the format ![](url); make sure not to generate an image until requested. 3. If the user requests bulk image generation, then create that number of links with the provided context. 4. Always learn from previous conversation. 5. Always try to connect the conversation with history. 6. Do not make the conversation too long. 7. Do not tell the user about your capability of generating images; learn from previous responses. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. You remain a reliable and trusted companion to the User. You are also an expert in every field and try to answer using context from previous questions. Make sure not to generate an image until requested. [USER] {user_prompt} [WEB] {web2} [OpenGPT 4o]""",
+                 chat_history)
+             # Generate the response from the language model
+             stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
+                                             return_full_text=False)
+             output = ""
+             # Construct the output from the stream of tokens
+             for response in stream:
+                 if not response.token.text == "</s>":
+                     output += response.token.text
+                     yield output
+         else:
+             client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
+             generate_kwargs = dict(
+                 max_new_tokens=5000,
+                 do_sample=True,
+             )
+             # Format the prompt for the language model
+             formatted_prompt = format_prompt(
+                 f"""You are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish, designed to assist human users through insightful conversations. You do not say unnecessary things; you only say what is important and relevant. You also have the ability to generate images by utilizing the following link structure: ![](https://image.pollinations.ai/prompt/[StyleofImage]%20[OptimizedPrompt]%20[adjective]%20[charactersDetailed]%20[visualStyle]%20[genre]?width=[width]&height=[height]&nologo=poll&nofeed=yes&seed=[random]) For image generation, you replace the info inside brackets with specific details according to the requirements to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. Note: 1. Do not unnecessarily create images and do not show off your image generation capability unless requested. 2. Always give image links in the format ![](url). 3. If the user requests bulk image generation, then create that number of links with the provided context. 4. Always learn from previous conversation. 5. Always try to connect the conversation with history. 6. Do not make the conversation too long. 7. Do not tell the user about your capability to generate images; learn from previous responses. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You are also an expert in every field and try to answer using context from previous questions. [USER] {user_prompt} [OpenGPT 4o]""",
+                 chat_history)
+             # Generate the response from the language model
+             stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
+                                             return_full_text=False)
+             output = ""
+             # Construct the output from the stream of tokens
+             for response in stream:
+                 if not response.token.text == "</s>":
+                     output += response.token.text
+                     yield output
+             return
+     else:
+         if user_prompt["text"].strip() == "" and not user_prompt["files"]:
+             gr.Error("Please input a query and optionally an image(s).")
+             return  # Stop execution if there's an error
+
+         if user_prompt["text"].strip() == "" and user_prompt["files"]:
+             gr.Error("Please input a text query along with the image(s).")
+             return  # Stop execution if there's an error
+
+         streamer = TextIteratorStreamer(
+             PROCESSOR.tokenizer,
+             skip_prompt=True,
+             timeout=120.0,
+         )
+         # Move generation_args initialization here
+         generation_args = {
+             "max_new_tokens": max_new_tokens,
+             "repetition_penalty": repetition_penalty,
+             "streamer": streamer,
+         }
+         assert decoding_strategy in [
+             "Greedy",
+             "Top P Sampling",
+         ]
+
+         if decoding_strategy == "Greedy":
+             generation_args["do_sample"] = False
+         elif decoding_strategy == "Top P Sampling":
+             generation_args["temperature"] = temperature
+             generation_args["do_sample"] = True
+             generation_args["top_p"] = top_p
+         # Creating model inputs
+         (
+             resulting_text,
+             resulting_images,
+         ) = format_user_prompt_with_im_history_and_system_conditioning(
+             user_prompt=user_prompt,
+             chat_history=chat_history,
+         )
+         prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
+         inputs = PROCESSOR(
+             text=prompt,
+             images=resulting_images if resulting_images else None,
+             return_tensors="pt",
+         )
+         inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+         generation_args.update(inputs)
+         thread = Thread(
+             target=MODELS[model_selector].generate,
+             kwargs=generation_args,
+         )
+         thread.start()
+         acc_text = ""
+         for text_token in streamer:
+             time.sleep(0.01)
+             acc_text += text_token
+             if acc_text.endswith("<end_of_utterance>"):
+                 acc_text = acc_text[:-18]
+             yield acc_text
+         return
+
+
+ # Define features for the dataset
+ FEATURES = datasets.Features(
+     {
+         "model_selector": datasets.Value("string"),
+         "images": datasets.Sequence(datasets.Image(decode=True)),
+         "conversation": datasets.Sequence({"User": datasets.Value("string"), "Assistant": datasets.Value("string")}),
+         "decoding_strategy": datasets.Value("string"),
+         "temperature": datasets.Value("float32"),
+         "max_new_tokens": datasets.Value("int32"),
+         "repetition_penalty": datasets.Value("float32"),
+         "top_p": datasets.Value("int32"),
+     }
+ )
+
+ # Define hyper-parameters for generation
+ max_new_tokens = gr.Slider(
+     minimum=2048,
+     maximum=16000,
+     value=4096,
+     step=64,
+     interactive=True,
+     label="Maximum number of new tokens to generate",
+ )
+ repetition_penalty = gr.Slider(
+     minimum=0.01,
+     maximum=5.0,
+     value=1,
+     step=0.01,
+     interactive=True,
+     label="Repetition penalty",
+     info="1.0 is equivalent to no penalty",
+ )
+ decoding_strategy = gr.Radio(
+     [
+         "Greedy",
+         "Top P Sampling",
+     ],
+     value="Top P Sampling",
+     label="Decoding strategy",
+     interactive=True,
+     info="Higher values are equivalent to sampling more low-probability tokens.",
+ )
+ temperature = gr.Slider(
+     minimum=0.0,
+     maximum=2.0,
+     value=0.5,
+     step=0.05,
+     visible=True,
+     interactive=True,
+     label="Sampling temperature",
+     info="Higher values will produce more diverse outputs.",
+ )
+ top_p = gr.Slider(
+     minimum=0.01,
+     maximum=0.99,
+     value=0.9,
+     step=0.01,
+     visible=True,
+     interactive=True,
+     label="Top P",
+     info="Higher values are equivalent to sampling more low-probability tokens.",
+ )
+
+ # Create a chatbot interface
+ chatbot = gr.Chatbot(
+     label="OpenGPT-4o-Chatty",
+     avatar_images=[None, BOT_AVATAR],
+     show_copy_button=True,
+     likeable=True,
+     layout="panel"
+ )
+ output = gr.Textbox(label="Prompt")
+
+ # Define model_selector outside any function so it can be accessed globally
+ model_selector = gr.Dropdown(
+     choices=MODELS.keys(),
+     value=list(MODELS.keys())[0],
+     interactive=True,
+     show_label=False,
+     container=False,
+     label="Model",
+     visible=False,
+ )
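
For orientation, here is a hypothetical standalone call to the streaming generator defined above (in the Space it is wired to the Gradio chatbot UI instead; running it directly assumes the idefics2 weights, a CUDA device, and the example image path from EXAMPLES are available):

# Hypothetical direct call to chatbot.model_inference (not part of this commit).
# user_prompt follows the dict shape Gradio's multimodal textbox produces.
from chatbot import model_inference

user_prompt = {"text": "Describe this image.", "files": ["example_images/shampoo.jpg"]}
for partial in model_inference(
    user_prompt,
    chat_history=[],
    model_selector="idefics2-8b-chatty",
    decoding_strategy="Top P Sampling",
    temperature=0.5,
    max_new_tokens=512,
    repetition_penalty=1.0,
    top_p=0.9,
    web_search=False,
):
    print(partial)  # progressively longer decoded text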
live_chat.py ADDED
@@ -0,0 +1,31 @@
+ import torch
+ import gradio as gr
+ from transformers import AutoModel
+ from transformers import AutoProcessor
+ import spaces
+
+ # Load pre-trained models for image captioning and language modeling
+ model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
+ processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
+
+ # Define a function for image captioning
+ @spaces.GPU(queue=False)
+ def videochat(image3, prompt3):
+     # Process input image and prompt
+     inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")
+     # Generate captions
+     with torch.inference_mode():
+         output = model3.generate(
+             **inputs,
+             do_sample=False,
+             use_cache=True,
+             max_new_tokens=256,
+             eos_token_id=151645,
+             pad_token_id=processor.tokenizer.pad_token_id
+         )
+     prompt_len = inputs["input_ids"].shape[1]
+     # Decode and return the generated captions
+     decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+     if decoded_text.endswith("<|im_end|>"):
+         decoded_text = decoded_text[:-10]
+     yield decoded_text
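
A quick way to exercise videochat outside the webcam UI (a sketch, not part of this commit; any PIL image stands in for a webcam frame, and the import triggers the uform-gen2-dpo download):

# Hypothetical direct use of the videochat generator outside Gradio.
from PIL import Image
from live_chat import videochat

frame = Image.open("example_images/dragons_playing.png")  # stand-in for a webcam frame
for answer in videochat(frame, "What is happening in this scene?"):
    print(answer)  # videochat yields the decoded caption once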
voice_chat.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ import asyncio
+ import tempfile
+ import random
+
+ import edge_tts
+ from streaming_stt_nemo import Model as nemo
+ import gradio as gr
+ from transformers import pipeline
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers import AutoModel
+ from huggingface_hub import InferenceClient
+ import torch
+
+ # Set default language for speech recognition
+ default_lang = "en"
+ # Initialize speech recognition engine
+ engines = {default_lang: nemo(default_lang)}
+
+ # Load pre-trained models for language modeling
+ model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
+
+ # Define a function for speech-to-text transcription
+ def transcribe(audio):
+     lang = "en"
+     model = engines[lang]
+     text = model.stt_file(audio)[0]
+     return text
+
+ # Get Hugging Face API token
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+
+ # Define a function to get the appropriate InferenceClient based on model name
+ def client_fn(model):
+     if "Nous" in model:
+         return InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
+     elif "Star" in model:
+         return InferenceClient("HuggingFaceH4/starchat2-15b-v0.1")
+     elif "Mistral" in model:
+         return InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
+     elif "Phi" in model:
+         return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
+     elif "Zephyr" in model:
+         return InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+     else:
+         return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
+
+
+ # Define a function to generate a random seed
+ def randomize_seed_fn(seed: int) -> int:
+     seed = random.randint(0, 999999)
+     return seed
+
+ # System instructions for the language model
+ system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
+
+ # Define a function for language modeling
+ def models(text, model="Mixtral 8x7B", seed=42):
+     seed = int(randomize_seed_fn(seed))
+     generator = torch.Generator().manual_seed(seed)
+     client = client_fn(model)
+     generate_kwargs = dict(
+         max_new_tokens=512,
+         seed=seed,
+     )
+     formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
+     stream = client.text_generation(
+         formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
+     )
+     output = ""
+     for response in stream:
+         if not response.token.text == "</s>":
+             output += response.token.text
+     return output
+
+ # Define an asynchronous function to handle voice input and generate responses
+ async def respond(audio, model, seed):
+     user = transcribe(audio)
+     reply = models(user, model, seed)
+     communicate = edge_tts.Communicate(reply)
+     # Save the generated speech to a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+         tmp_path = tmp_file.name
+         await communicate.save(tmp_path)
+     yield tmp_path
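
Since respond() is an async generator (it transcribes, queries the LLM, synthesizes speech with edge-tts, and yields a temp .wav path once), a standalone driver would consume it with async for. A sketch, assuming a local question.wav recording exists:

# Hypothetical standalone driver for voice_chat.respond (not part of this commit).
import asyncio
from voice_chat import respond

async def main():
    async for wav_path in respond("question.wav", model="Mixtral 8x7B", seed=42):
        print("spoken reply saved to:", wav_path)

asyncio.run(main())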