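"""Audio Recognition demo: upload an .mp3 or .wav file, transcribe it with
OpenAI Whisper, then chat about the transcript with gpt-3.5-turbo through a
Gradio UI."""
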
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
import requests
from openai import OpenAI

prompt = "Type and press Enter"

def record_text(audio_file, api_key):
    """Re-encode the uploaded audio as mp3 and transcribe it with Whisper."""
    client = OpenAI(api_key=api_key)
    output_file = "converted_sound.mp3"
    # The UI accepts .mp3 or .wav, so let pydub sniff the input format;
    # re-encoding to mp3 also keeps uploads under the API's file-size limit.
    sound = AudioSegment.from_file(audio_file)
    sound.export(output_file, format="mp3")
    with open(output_file, "rb") as f:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="text",
        )
    return transcript

def record_text_google(audio_file):
    """Alternative path: transcribe with Google's free recognizer via the
    SpeechRecognition package. Kept for reference; not wired into the UI."""
    if audio_file.split(".")[-1] == "mp3":
        # SpeechRecognition needs wav input, so convert mp3 first.
        output_file = "con_sound.wav"
        AudioSegment.from_mp3(audio_file).export(output_file, format="wav")
        audio_file = output_file
    r = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        r.adjust_for_ambient_noise(source)
        audio_data = r.record(source, duration=None)
    return r.recognize_google(audio_data, language="en-US").lower()

def api_calling(audio_file, prompt, api_key):
    """Transcribe the audio; if a question was typed, answer it with the
    transcript as context via the chat completions endpoint."""
    audio_text = record_text(audio_file, api_key)
    if len(prompt) == 0:
        # No question asked: just return the raw transcript.
        return audio_text
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    # Send the question and the transcript as a single user message.
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": f"{prompt}\n\n{audio_text}"}
        ],
        "max_tokens": 1000,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
    )
    return response.json()["choices"][0]["message"]["content"]
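
# Sketch: the same chat call through the OpenAI SDK instead of raw requests.
# `api_calling_sdk` is a hypothetical drop-in alternative (assumes openai>=1.0);
# the UI below keeps using api_calling.
def api_calling_sdk(audio_file, prompt, api_key):
    audio_text = record_text(audio_file, api_key)
    if len(prompt) == 0:
        return audio_text
    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": f"{prompt}\n\n{audio_text}"}],
        max_tokens=1000,
    )
    return completion.choices[0].message.content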

def message_and_history(audio_file, user_message, history, api_key):
    """Run one chat turn and append a (question, answer) pair to the history."""
    history = history or []
    output_text = api_calling(audio_file, user_message, api_key)
    if len(user_message) == 0:
        # No question typed: label the raw transcript in the chat window.
        user_message = "Speech from the audio."
    history.append((user_message, output_text))
    return history, history

block = gr.Blocks(theme=gr.themes.Monochrome(primary_hue="slate"))

with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask & Learn about an Audio</center></h1>""")
    with gr.Row():
        with gr.Column(scale=1):
            aud_input = gr.Audio(type="filepath", label="Upload .mp3 or .wav file", sources=["upload"])
            api_input = gr.Textbox(label="Enter API key", type="password")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
    state = gr.State()
    upload_button.click(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    # Clear the textbox after each submit.
    message.submit(lambda: None, None, message, queue=False)

block.launch()
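
# To run locally (a setup sketch, not part of the Space itself):
#   pip install gradio pydub openai SpeechRecognition requests
#   python app.py    # file name assumed per Hugging Face Spaces convention
# pydub also needs ffmpeg available on PATH to decode/encode mp3.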