Ubaidbhat committed on
Commit
c486906
1 Parent(s): 32940ab

voice added

Files changed (2)
  1. app.py +41 -41
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,7 +1,7 @@
 from pathlib import Path
 from openai import OpenAI
 import soundfile as sf
-# from pydub import AudioSegment
+from pydub import AudioSegment
 import base64
 import logging
 import numpy as np
@@ -10,38 +10,38 @@ import os
 os.environ["OPENAI_API_KEY"] = "sk-proj-5dsm5f2bbRjgxAdWtE4yT3BlbkFJ6drh7Ilpp3EEVtBqETte"
 client = OpenAI()
 
-# # Set up logging
-# logging.basicConfig(level=logging.INFO)
-# def transform_text_to_speech(text: str):
-#     # Generate speech from transcription
-#     speech_file_path_mp3 = Path.cwd() / "speech.mp3"
-#     speech_file_path_wav = Path.cwd() / "speech.wav"
-#     response = client.audio.speech.create(
-#         model="tts-1",
-#         voice="onyx",
-#         input=text
-#     )
-
-#     with open(speech_file_path_mp3, "wb") as f:
-#         f.write(response.content)
-
-#     # Convert mp3 to wav
-#     audio = AudioSegment.from_mp3(speech_file_path_mp3)
-#     audio.export(speech_file_path_wav, format="wav")
-
-#     # Read the audio file and encode it to base64
-#     with open(speech_file_path_wav, "rb") as audio_file:
-#         audio_data = audio_file.read()
-#         audio_base64 = base64.b64encode(audio_data).decode('utf-8')
-
-#     # Create an HTML audio player with autoplay
-#     audio_html = f"""
-#     <audio controls autoplay>
-#         <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
-#         Your browser does not support the audio element.
-#     </audio>
-#     """
-#     return audio_html
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+def transform_text_to_speech(text: str):
+    # Generate speech from transcription
+    speech_file_path_mp3 = Path.cwd() / "speech.mp3"
+    speech_file_path_wav = Path.cwd() / "speech.wav"
+    response = client.audio.speech.create(
+        model="tts-1",
+        voice="onyx",
+        input=text
+    )
+
+    with open(speech_file_path_mp3, "wb") as f:
+        f.write(response.content)
+
+    # Convert mp3 to wav
+    audio = AudioSegment.from_mp3(speech_file_path_mp3)
+    audio.export(speech_file_path_wav, format="wav")
+
+    # Read the audio file and encode it to base64
+    with open(speech_file_path_wav, "rb") as audio_file:
+        audio_data = audio_file.read()
+        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
+
+    # Create an HTML audio player with autoplay
+    audio_html = f"""
+    <audio controls autoplay>
+        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
+        Your browser does not support the audio element.
+    </audio>
+    """
+    return audio_html
 
 
 def transform_speech_to_text(audio):
@@ -197,7 +197,7 @@ def pred(image, input_text, audio):
 
     if input_text.lower().strip() == "flush":
         memory = ConversationBufferMemory(ai_prefix="old Person", human_prefix="Studs Terkel")
-        return "Ready for the new session", "Let's go!"
+        return "Ready for the new session", "Let's go!", transform_text_to_speech("Ready for the new session")
 
 
     if new_photo_uploaded:
@@ -210,7 +210,7 @@ def pred(image, input_text, audio):
         res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
         question = res["description"]
         new_photo_uploaded = False
-        return "New Photo Uploaded", question
+        return "New Photo Uploaded", question, transform_text_to_speech(question)
 
 
     if input_text.strip() != "":
@@ -223,11 +223,11 @@ def pred(image, input_text, audio):
         res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
         question = res["question"]
        text = input_text
-        return text, question
+        return text, question, transform_text_to_speech(question)
 
     if audio is None:
         message = "Please wait at least 5 seconds after finishing your recording before submitting it to ensure it is fully captured. Thank you!"
-        return message, message
+        return message, message, transform_text_to_speech(message)
 
     i += 1
     if i >= 2:
@@ -239,7 +239,7 @@ def pred(image, input_text, audio):
     memory.save_context({"input": question}, {"output": text})
     res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
     question = res["question"]
-    return text, question
+    return text, question, transform_text_to_speech(question)
 
 # Backend function to clear inputs
 def clear_inputs():
@@ -260,16 +260,16 @@ with gr.Blocks() as demo:
     # Output fields
     user_input_output = gr.Textbox(label="User Input")
     stud_output = gr.Textbox(label="Studs Terkel")
-    # audio_output = gr.HTML(label="Audio Player")
+    audio_output = gr.HTML(label="Audio Player")
 
     with gr.Row():
         # Buttons at the bottom
         submit_button = gr.Button("Submit")
-        clear_button = gr.Button("Clear", elem_id="clear-button")
+        clear_button = gr.Button("Upload", elem_id="clear-button")
 
     # Linking the submit button with the pred function
     submit_button.click(fn=pred, inputs=[image_input, text_input, audio_input],
-                        outputs=[user_input_output, stud_output])
+                        outputs=[user_input_output, stud_output, audio_output])
 
     # Linking the clear button with the clear_inputs function
     clear_button.click(fn=clear_inputs, inputs=None, outputs=[image_input, text_input, audio_input])
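For reference, the new transform_text_to_speech helper could be slimmed down: the TTS endpoint can return WAV directly via the SDK's response_format parameter, which would remove the MP3-to-WAV hop through pydub and the temporary files. A minimal sketch, not part of this commit; make_audio_player is a hypothetical name:

import base64
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def make_audio_player(text: str) -> str:
    # Hypothetical variant: request WAV output directly, so no
    # pydub/ffmpeg conversion and no speech.mp3/speech.wav temp files.
    response = client.audio.speech.create(
        model="tts-1",
        voice="onyx",
        input=text,
        response_format="wav",
    )
    audio_base64 = base64.b64encode(response.content).decode("utf-8")
    # Same autoplaying <audio> element the commit returns to the gr.HTML output
    return f"""
    <audio controls autoplay>
        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    """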
requirements.txt CHANGED
@@ -2,4 +2,6 @@ gradio
 OpenAI
 langchain
 langchain_openai
+pydub
+simpleaudio
 soundfile
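Note that pydub does not decode MP3 itself; it shells out to an ffmpeg (or libav) binary, so the Space needs ffmpeg available at runtime in addition to the pip packages above (on Hugging Face Spaces, typically via a packages.txt listing ffmpeg). A minimal startup check, as a sketch:

import shutil

# pydub converts speech.mp3 to WAV by invoking ffmpeg; fail fast with a
# clear error instead of a cryptic decode failure on the first TTS call.
if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found on PATH; pydub cannot convert MP3 to WAV")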