Ubaidbhat committed on
Commit
c486906
1 Parent(s): 32940ab

voice added

Files changed (2)
  1. app.py +41 -41
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,7 +1,7 @@
 from pathlib import Path
 from openai import OpenAI
 import soundfile as sf
-# from pydub import AudioSegment
+from pydub import AudioSegment
 import base64
 import logging
 import numpy as np
@@ -10,38 +10,38 @@ import os
 os.environ["OPENAI_API_KEY"] = "sk-proj-5dsm5f2bbRjgxAdWtE4yT3BlbkFJ6drh7Ilpp3EEVtBqETte"
 client = OpenAI()
 
-# # Set up logging
-# logging.basicConfig(level=logging.INFO)
-# def transform_text_to_speech(text: str):
-#     # Generate speech from transcription
-#     speech_file_path_mp3 = Path.cwd() / "speech.mp3"
-#     speech_file_path_wav = Path.cwd() / "speech.wav"
-#     response = client.audio.speech.create(
-#         model="tts-1",
-#         voice="onyx",
-#         input=text
-#     )
-
-#     with open(speech_file_path_mp3, "wb") as f:
-#         f.write(response.content)
-
-#     # Convert mp3 to wav
-#     audio = AudioSegment.from_mp3(speech_file_path_mp3)
-#     audio.export(speech_file_path_wav, format="wav")
-
-#     # Read the audio file and encode it to base64
-#     with open(speech_file_path_wav, "rb") as audio_file:
-#         audio_data = audio_file.read()
-#         audio_base64 = base64.b64encode(audio_data).decode('utf-8')
-
-#     # Create an HTML audio player with autoplay
-#     audio_html = f"""
-#     <audio controls autoplay>
-#         <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
-#         Your browser does not support the audio element.
-#     </audio>
-#     """
-#     return audio_html
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+def transform_text_to_speech(text: str):
+    # Generate speech from transcription
+    speech_file_path_mp3 = Path.cwd() / "speech.mp3"
+    speech_file_path_wav = Path.cwd() / "speech.wav"
+    response = client.audio.speech.create(
+        model="tts-1",
+        voice="onyx",
+        input=text
+    )
+
+    with open(speech_file_path_mp3, "wb") as f:
+        f.write(response.content)
+
+    # Convert mp3 to wav
+    audio = AudioSegment.from_mp3(speech_file_path_mp3)
+    audio.export(speech_file_path_wav, format="wav")
+
+    # Read the audio file and encode it to base64
+    with open(speech_file_path_wav, "rb") as audio_file:
+        audio_data = audio_file.read()
+        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
+
+    # Create an HTML audio player with autoplay
+    audio_html = f"""
+    <audio controls autoplay>
+        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
+        Your browser does not support the audio element.
+    </audio>
+    """
+    return audio_html
 
 
 def transform_speech_to_text(audio):
@@ -197,7 +197,7 @@ def pred(image, input_text, audio):
 
     if input_text.lower().strip() == "flush":
         memory = ConversationBufferMemory(ai_prefix="old Person", human_prefix="Studs Terkel")
-        return "Ready for the new session", "Let's go!"
+        return "Ready for the new session", "Let's go!", transform_text_to_speech("Ready for the new session")
 
 
     if new_photo_uploaded:
@@ -210,7 +210,7 @@ def pred(image, input_text, audio):
         res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
         question = res["description"]
         new_photo_uploaded = False
-        return "New Photo Uploaded", question
+        return "New Photo Uploaded", question, transform_text_to_speech(question)
 
 
     if input_text.strip() != "":
@@ -223,11 +223,11 @@ def pred(image, input_text, audio):
         res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
         question = res["question"]
        text = input_text
-        return text, question
+        return text, question, transform_text_to_speech(question)
 
     if audio is None:
         message = "Please wait at least 5 seconds after finishing your recording before submitting it to ensure it is fully captured. Thank you!"
-        return message, message
+        return message, message, transform_text_to_speech(message)
 
     i += 1
     if i >= 2:
@@ -239,7 +239,7 @@ def pred(image, input_text, audio):
     memory.save_context({"input": question}, {"output": text})
     res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
     question = res["question"]
-    return text, question
+    return text, question, transform_text_to_speech(question)
 
 # Backend function to clear inputs
 def clear_inputs():
@@ -260,16 +260,16 @@ with gr.Blocks() as demo:
     # Output fields
     user_input_output = gr.Textbox(label="User Input")
     stud_output = gr.Textbox(label="Studs Terkel")
-    # audio_output = gr.HTML(label="Audio Player")
+    audio_output = gr.HTML(label="Audio Player")
 
     with gr.Row():
         # Buttons at the bottom
         submit_button = gr.Button("Submit")
-        clear_button = gr.Button("Clear", elem_id="clear-button")
+        clear_button = gr.Button("Upload", elem_id="clear-button")
 
     # Linking the submit button with the pred function
     submit_button.click(fn=pred, inputs=[image_input, text_input, audio_input],
-                        outputs=[user_input_output, stud_output])
+                        outputs=[user_input_output, stud_output, audio_output])
 
     # Linking the clear button with the clear_inputs function
     clear_button.click(fn=clear_inputs, inputs=None, outputs=[image_input, text_input, audio_input])
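For reference, the new transform_text_to_speech helper could be slimmed down: the TTS endpoint can return WAV directly via the SDK's response_format parameter, which would remove the MP3-to-WAV hop through pydub and the temporary files. A minimal sketch, not part of this commit; make_audio_player is a hypothetical name:

import base64
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def make_audio_player(text: str) -> str:
    # Hypothetical variant: request WAV output directly, so no
    # pydub/ffmpeg conversion and no speech.mp3/speech.wav temp files.
    response = client.audio.speech.create(
        model="tts-1",
        voice="onyx",
        input=text,
        response_format="wav",
    )
    audio_base64 = base64.b64encode(response.content).decode("utf-8")
    # Same autoplaying <audio> element the commit returns to the gr.HTML output
    return f"""
    <audio controls autoplay>
        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    """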
requirements.txt CHANGED
@@ -2,4 +2,6 @@ gradio
 OpenAI
 langchain
 langchain_openai
+pydub
+simpleaudio
 soundfile
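Note that pydub does not decode MP3 itself; it shells out to an ffmpeg (or libav) binary, so the Space needs ffmpeg available at runtime in addition to the pip packages above (on Hugging Face Spaces, typically via a packages.txt listing ffmpeg). A minimal startup check, as a sketch:

import shutil

# pydub converts speech.mp3 to WAV by invoking ffmpeg; fail fast with a
# clear error instead of a cryptic decode failure on the first TTS call.
if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found on PATH; pydub cannot convert MP3 to WAV")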