Spaces:
Paused
Paused
voice added
Browse files- app.py +41 -41
- requirements.txt +2 -0
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from pathlib import Path
|
2 |
from openai import OpenAI
|
3 |
import soundfile as sf
|
4 |
-
|
5 |
import base64
|
6 |
import logging
|
7 |
import numpy as np
|
@@ -10,38 +10,38 @@ import os
|
|
10 |
os.environ["OPENAI_API_KEY"] = "<REDACTED-OPENAI-API-KEY>"  # NOTE(review): a real secret key was committed here — revoke it and load the key from the environment instead of hardcoding it
|
11 |
client = OpenAI()
|
12 |
|
13 |
-
#
|
14 |
-
|
15 |
-
|
16 |
-
#
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
#
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
#
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
#
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
|
46 |
|
47 |
def transform_speech_to_text(audio):
|
@@ -197,7 +197,7 @@ def pred(image, input_text, audio):
|
|
197 |
|
198 |
if input_text.lower().strip() == "flush":
|
199 |
memory = ConversationBufferMemory(ai_prefix="old Person", human_prefix = "stud's terkel")
|
200 |
-
return "Ready fot the new session", "Lets goo!"
|
201 |
|
202 |
|
203 |
if new_photo_uploaded:
|
@@ -210,7 +210,7 @@ def pred(image, input_text, audio):
|
|
210 |
res = get_image_informations(imagePath,conversation_prompt, memory, new_photo_uploaded)
|
211 |
question = res["description"]
|
212 |
new_photo_uploaded = False
|
213 |
-
return "New Photo Uploaded", question
|
214 |
|
215 |
|
216 |
if input_text.strip() != "":
|
@@ -223,11 +223,11 @@ def pred(image, input_text, audio):
|
|
223 |
res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
|
224 |
question = res["question"]
|
225 |
text = input_text
|
226 |
-
return text, question
|
227 |
|
228 |
if audio is None:
|
229 |
message = "Please wait at least 5 seconds after finishing your recording before submitting it to ensure it is fully captured. Thank you!"
|
230 |
-
return message ,
|
231 |
|
232 |
i += 1
|
233 |
if i >= 2:
|
@@ -239,7 +239,7 @@ def pred(image, input_text, audio):
|
|
239 |
memory.save_context({"input": question}, {"output": text})
|
240 |
res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
|
241 |
question = res["question"]
|
242 |
-
return text, question
|
243 |
|
244 |
# Backend function to clear inputs
|
245 |
def clear_inputs():
|
@@ -260,16 +260,16 @@ with gr.Blocks() as demo:
|
|
260 |
# Output fields
|
261 |
user_input_output = gr.Textbox(label="User Input")
|
262 |
stud_output = gr.Textbox(label="Stud's Terkel")
|
263 |
-
|
264 |
|
265 |
with gr.Row():
|
266 |
# Buttons at the bottom
|
267 |
submit_button = gr.Button("Submit")
|
268 |
-
clear_button = gr.Button("
|
269 |
|
270 |
# Linking the submit button with the save_audio function
|
271 |
submit_button.click(fn=pred, inputs=[image_input, text_input, audio_input],
|
272 |
-
outputs=[user_input_output, stud_output])
|
273 |
|
274 |
# Linking the clear button with the clear_inputs function
|
275 |
clear_button.click(fn=clear_inputs, inputs=None, outputs=[image_input, text_input, audio_input])
|
|
|
1 |
from pathlib import Path
|
2 |
from openai import OpenAI
|
3 |
import soundfile as sf
|
4 |
+
from pydub import AudioSegment
|
5 |
import base64
|
6 |
import logging
|
7 |
import numpy as np
|
|
|
10 |
os.environ["OPENAI_API_KEY"] = "<REDACTED-OPENAI-API-KEY>"  # NOTE(review): a real secret key was committed here — revoke it and load the key from the environment instead of hardcoding it
|
11 |
client = OpenAI()
|
12 |
|
13 |
+
# Set up logging
|
14 |
+
logging.basicConfig(level=logging.INFO)
|
15 |
+
def transform_text_to_speech(text: str):
|
16 |
+
# Generate speech from transcription
|
17 |
+
speech_file_path_mp3 = Path.cwd() / "speech.mp3"
|
18 |
+
speech_file_path_wav = Path.cwd() / "speech.wav"
|
19 |
+
response = client.audio.speech.create(
|
20 |
+
model="tts-1",
|
21 |
+
voice="onyx",
|
22 |
+
input=text
|
23 |
+
)
|
24 |
+
|
25 |
+
with open(speech_file_path_mp3, "wb") as f:
|
26 |
+
f.write(response.content)
|
27 |
+
|
28 |
+
# Convert mp3 to wav
|
29 |
+
audio = AudioSegment.from_mp3(speech_file_path_mp3)
|
30 |
+
audio.export(speech_file_path_wav, format="wav")
|
31 |
+
|
32 |
+
# Read the audio file and encode it to base64
|
33 |
+
with open(speech_file_path_wav, "rb") as audio_file:
|
34 |
+
audio_data = audio_file.read()
|
35 |
+
audio_base64 = base64.b64encode(audio_data).decode('utf-8')
|
36 |
+
|
37 |
+
# Create an HTML audio player with autoplay
|
38 |
+
audio_html = f"""
|
39 |
+
<audio controls autoplay>
|
40 |
+
<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
|
41 |
+
Your browser does not support the audio element.
|
42 |
+
</audio>
|
43 |
+
"""
|
44 |
+
return audio_html
|
45 |
|
46 |
|
47 |
def transform_speech_to_text(audio):
|
|
|
197 |
|
198 |
if input_text.lower().strip() == "flush":
|
199 |
memory = ConversationBufferMemory(ai_prefix="old Person", human_prefix = "stud's terkel")
|
200 |
+
return "Ready for the new session", "Lets goo!", transform_text_to_speech("Ready for the new session")
|
201 |
|
202 |
|
203 |
if new_photo_uploaded:
|
|
|
210 |
res = get_image_informations(imagePath,conversation_prompt, memory, new_photo_uploaded)
|
211 |
question = res["description"]
|
212 |
new_photo_uploaded = False
|
213 |
+
return "New Photo Uploaded", question, transform_text_to_speech(question)
|
214 |
|
215 |
|
216 |
if input_text.strip() != "":
|
|
|
223 |
res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
|
224 |
question = res["question"]
|
225 |
text = input_text
|
226 |
+
return text, question, transform_text_to_speech(question)
|
227 |
|
228 |
if audio is None:
|
229 |
message = "Please wait at least 5 seconds after finishing your recording before submitting it to ensure it is fully captured. Thank you!"
|
230 |
+
return message , message, transform_text_to_speech(message)
|
231 |
|
232 |
i += 1
|
233 |
if i >= 2:
|
|
|
239 |
memory.save_context({"input": question}, {"output": text})
|
240 |
res = get_image_informations(imagePath, conversation_prompt, memory, new_photo_uploaded)
|
241 |
question = res["question"]
|
242 |
+
return text, question, transform_text_to_speech(question)
|
243 |
|
244 |
# Backend function to clear inputs
|
245 |
def clear_inputs():
|
|
|
260 |
# Output fields
|
261 |
user_input_output = gr.Textbox(label="User Input")
|
262 |
stud_output = gr.Textbox(label="Stud's Terkel")
|
263 |
+
audio_output = gr.HTML(label="Audio Player")
|
264 |
|
265 |
with gr.Row():
|
266 |
# Buttons at the bottom
|
267 |
submit_button = gr.Button("Submit")
|
268 |
+
clear_button = gr.Button("Upload ", elem_id="clear-button")
|
269 |
|
270 |
# Linking the submit button with the save_audio function
|
271 |
submit_button.click(fn=pred, inputs=[image_input, text_input, audio_input],
|
272 |
+
outputs=[user_input_output, stud_output, audio_output])
|
273 |
|
274 |
# Linking the clear button with the clear_inputs function
|
275 |
clear_button.click(fn=clear_inputs, inputs=None, outputs=[image_input, text_input, audio_input])
|
requirements.txt
CHANGED
@@ -2,4 +2,6 @@ gradio
|
|
2 |
OpenAI
|
3 |
langchain
|
4 |
langchain_openai
|
|
|
|
|
5 |
soundfile
|
|
|
2 |
OpenAI
|
3 |
langchain
|
4 |
langchain_openai
|
5 |
+
soundfile  # NOTE(review): duplicate — 'soundfile' also appears as line 7 of this file; keep only one entry
|
6 |
+
simpleaudio
|
7 |
soundfile
|