# NOTE: the "Spaces: Sleeping" banner that appeared here was Hugging Face
# Spaces UI residue captured when this file was exported; it is not code.
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests
import openai
from openai import OpenAI
# Placeholder text for the question textbox (passed as `placeholder=` in the UI below).
prompt = "Type and press Enter"
def record_text(audio_file, api_key):
    """Transcribe an uploaded audio file with OpenAI Whisper.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the uploaded audio (.wav or .mp3).
    api_key : str
        OpenAI API key used to authenticate the transcription request.

    Returns
    -------
    str
        Plain-text transcript from the ``whisper-1`` model
        (``response_format="text"`` makes the client return a string).
    """
    client = OpenAI(api_key=api_key)

    # Whisper accepts mp3 directly; only .wav uploads need converting.
    # The original unconditionally called AudioSegment.from_wav, which
    # crashed on .mp3 files even though the UI advertises both formats.
    if audio_file.lower().endswith(".wav"):
        converted = "converted_sound.mp3"
        AudioSegment.from_wav(audio_file).export(converted, format="mp3")
        audio_file = converted

    # `with` closes the handle even if the API call raises
    # (the original opened the file and never closed it).
    with open(audio_file, "rb") as audio_fh:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_fh,
            response_format="text",
        )
    return transcript
def api_calling(audio_file, prompt, api_key):
    """Transcribe the audio, then run `prompt` over the transcript via chat API.

    Parameters
    ----------
    audio_file : str
        Path to the uploaded audio file, forwarded to ``record_text``.
    prompt : str
        User question about the transcript. When empty, a default
        punctuation/casing clean-up prompt is applied instead.
    api_key : str
        OpenAI API key for both the transcription and the chat request.

    Returns
    -------
    str
        The chat model's reply.

    Raises
    ------
    requests.HTTPError
        If the chat-completions endpoint returns an error status.
    """
    audio_text = record_text(audio_file, api_key)

    # Bug fix: the original assigned this default prompt and then returned the
    # raw transcript, so the clean-up request was never actually sent.
    if not prompt:
        prompt = "Apply proper punctuations, upper case and lower case to the provided text."

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": "gpt-3.5-turbo",
        # gpt-3.5-turbo expects a plain string message; prompt and transcript
        # are combined rather than sent as a multi-part content list.
        "messages": [
            {"role": "user", "content": f"{prompt}\n\n{audio_text}"}
        ],
        "max_tokens": 1000,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,  # don't hang the Gradio UI on a stalled request
    )
    # Surface API errors explicitly instead of a confusing KeyError below.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
def message_and_history(audio_text, input, history, api_key):
    """Gradio callback: append one (question, answer) turn to the chat history.

    Parameters
    ----------
    audio_text : str
        Path of the uploaded audio file (from the ``gr.Audio`` component).
    input : str
        The user's question from the textbox; may be empty on first upload.
        (Name shadows the builtin but is kept for interface stability.)
    history : list[tuple[str, str]] | None
        Accumulated chat turns; ``None`` on the first invocation.
    api_key : str
        OpenAI API key forwarded to ``api_calling``.

    Returns
    -------
    tuple[list, list]
        The same history list twice: once for the chatbot display,
        once for the ``gr.State`` carrier.
    """
    history = history or []  # first call: gr.State starts out as None
    output_text = api_calling(audio_text, input, api_key)
    # Original had two byte-identical if/else branches; the only real intent
    # was to give empty questions a readable label in the chat log.
    label = input if input else "Speech from the video."
    history.append((label, output_text))
    return history, history
# --- Gradio UI wiring ------------------------------------------------------
block = gr.Blocks(theme=gr.themes.Monochrome(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask & Learn about an Audio</center></h1> """)
    with gr.Row():
        with gr.Column(scale=0.5):
            # Left column: audio upload, API-key entry, and the start button.
            aud_input = gr.Audio(type="filepath", label="Upload .mp3 or .wav file", sources="upload")
            api_input = gr.Textbox(label="Enter Api-key")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            # Right column: chat display plus the free-form question box.
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            # Carries the (question, answer) history between callbacks.
            state = gr.State()
    # Both the button click and pressing Enter in the textbox run the same handler.
    upload_button.click(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    # Second submit handler clears the textbox after the question is sent.
    message.submit(lambda: None, None, message, queue=False)
block.launch()