# app.py — uploaded by Vihang28 (commit 2a7a0fd, verified)
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests
import openai
from openai import OpenAI
# Placeholder text shown in the UI's question textbox (see gr.Textbox below).
prompt = "Type and press Enter"
def record_text(audio_file, api_key):
    """Transcribe an uploaded audio file using the OpenAI Whisper API.

    Parameters
    ----------
    audio_file : str
        Path to the uploaded audio file (.wav or .mp3), as provided by
        the Gradio ``Audio(type="filepath")`` component.
    api_key : str
        OpenAI API key used to authenticate the transcription request.

    Returns
    -------
    str
        The transcription as plain text (``response_format="text"``).
    """
    client = OpenAI(api_key=api_key)

    # The UI advertises ".mp3 or .wav", but the original code always called
    # AudioSegment.from_wav(), which crashes on mp3 uploads. Whisper accepts
    # mp3 directly, so only transcode when the upload is actually a wav.
    if audio_file.lower().endswith(".wav"):
        converted = "converted_sound.mp3"
        AudioSegment.from_wav(audio_file).export(converted, format="mp3")
        audio_file = converted

    # Context manager ensures the file handle is closed (the original
    # opened the file and never closed it).
    with open(audio_file, "rb") as fh:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=fh,
            response_format="text",
        )
    return transcript
def api_calling(audio_file, prompt, api_key):
    """Transcribe the audio, then optionally post-process it with a chat model.

    Parameters
    ----------
    audio_file : str
        Path to the uploaded audio file; forwarded to ``record_text``.
    prompt : str
        User question about the audio. When empty, the raw transcription
        is returned unchanged.
    api_key : str
        OpenAI API key, used both for transcription and the chat request.

    Returns
    -------
    str
        Either the raw transcription (empty prompt) or the chat model's
        answer about the transcription.

    Raises
    ------
    requests.HTTPError
        If the chat-completions endpoint returns a non-2xx status.
    """
    audio_text = record_text(audio_file, api_key)

    # No question asked: return the transcription as-is. (The original
    # assigned a punctuation prompt here and then never used it.)
    if not prompt:
        return audio_text

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                # Two text parts: the user's question followed by the
                # transcription it refers to.
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "text", "text": audio_text},
                ],
            }
        ],
        "max_tokens": 1000,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,  # avoid hanging the UI forever on a stalled request
    )
    # Surface HTTP errors explicitly instead of an opaque KeyError on
    # "choices" when the API returns an error body.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
def message_and_history(audio_text, input, history, api_key):
    """Run one chat turn and append it to the conversation history.

    Parameters
    ----------
    audio_text : str
        Path to the uploaded audio file (Gradio Audio component value).
    input : str
        The user's typed question; may be empty. (Name kept for
        interface compatibility even though it shadows the builtin.)
    history : list[tuple[str, str]] | None
        Accumulated (question, answer) pairs; ``None`` on the first turn.
    api_key : str
        OpenAI API key forwarded to ``api_calling``.

    Returns
    -------
    tuple[list, list]
        The updated history twice — once for the Chatbot display and
        once for the State component.
    """
    history = history or []
    output_text = api_calling(audio_text, input, api_key)
    # The original if/else appended the identical pair in both branches;
    # only the displayed label differs when no question was typed.
    label = input if input else "Speech from the video."
    history.append((label, output_text))
    return history, history
# --- Gradio UI wiring (runs at import time) ---
block = gr.Blocks(theme=gr.themes.Monochrome(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask & Learn about an Audio</center></h1> """)
    with gr.Row():
        with gr.Column(scale=0.5):
            # Left column: audio upload, API key entry, and the start button.
            aud_input = gr.Audio(type="filepath", label="Upload .mp3 or .wav file", sources="upload")
            api_input = gr.Textbox(label="Enter Api-key")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            # Right column: chat history display and the question textbox.
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            # Holds the (question, answer) history across turns.
            state = gr.State()
    # Both the button click and pressing Enter in the textbox run one chat turn.
    upload_button.click(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    # Second submit handler clears the textbox after the question is sent.
    message.submit(lambda: None, None, message, queue=False)
block.launch()