Spaces:
Sleeping
Sleeping
File size: 3,781 Bytes
1c66a80 ed0abb5 1c66a80 678aa20 1c66a80 5a85225 bd66273 0b6f717 bd66273 036a67e 12d3854 fcabb84 1f38dc2 fcabb84 1c66a80 fcabb84 c52635d fcabb84 12d3854 246aa34 5a85225 12d3854 08f3107 12d3854 d66b86d 12d3854 1c66a80 08f3107 1c66a80 08f3107 1c66a80 cefb089 e103d16 cefb089 1c66a80 cefb089 1c66a80 678aa20 1c66a80 246aa34 1c66a80 246aa34 1c66a80 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests
import openai
# Placeholder text shown inside the user-message textbox of the Gradio UI.
prompt = "Type and press Enter"
def record_text(audio_file, api_key):
    """Transcribe an audio file to text with OpenAI's Whisper API.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the audio file to transcribe (Gradio supplies
        this via its ``type="filepath"`` Audio component).
    api_key : str
        OpenAI API key used to authenticate the request.

    Returns
    -------
    str
        The transcribed text.

    Raises
    ------
    openai.OpenAIError
        If the API request fails (bad key, network error, etc.).
    OSError
        If ``audio_file`` cannot be opened.
    """
    # The previous revision mixed Node.js idioms into Python: unquoted
    # object keys ({model: ...}) and fs.createReadStream(), both of which
    # raise NameError at call time.  Use the openai-python 1.x client
    # interface instead, and close the media file deterministically.
    client = openai.OpenAI(api_key=api_key)
    with open(audio_file, "rb") as media_file:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=media_file,
        )
    # The 1.x client returns a Transcription object whose text lives on
    # the .text attribute (not response.data['text']).
    return response.text
def api_calling(audio_file, prompt, api_key):
    """Transcribe the audio, then ask the chat model about the transcript.

    Parameters
    ----------
    audio_file : str
        Path of the uploaded audio file, forwarded to ``record_text``.
    prompt : str
        User instruction; when empty, a default punctuation/casing
        instruction is substituted.
    api_key : str
        OpenAI API key used for both the transcription and the chat call.

    Returns
    -------
    str
        The assistant's reply text.
    """
    transcript = record_text(audio_file, api_key)

    # Fall back to a cleanup instruction when the user typed nothing.
    if len(prompt) == 0:
        prompt = "Apply proper punctuations, upper case and lower case to the provided text."

    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    # Send the instruction and the transcript as two text parts of a
    # single user message.
    request_body = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "text", "text": transcript},
                ],
            }
        ],
        "max_tokens": 1000,
    }

    reply = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=request_headers,
        json=request_body,
    )
    return reply.json()["choices"][0]["message"]["content"]
def message_and_history(audio_text, input, history, api_key):
    """Gradio callback: run one chat turn and update the conversation log.

    Parameters
    ----------
    audio_text : str
        Filepath of the uploaded audio, passed through to ``api_calling``.
    input : str
        The user's question; may be empty, in which case ``api_calling``
        substitutes its own default prompt and a placeholder label is
        recorded in the history.
    history : list[tuple[str, str]] | None
        Accumulated (question, answer) pairs; ``None`` on the first call.
    api_key : str
        OpenAI API key.

    Returns
    -------
    tuple
        ``(history, history)`` — duplicated so Gradio can feed both the
        Chatbot component and the State component from one callback.
    """
    history = history or []
    output_text = api_calling(audio_text, input, api_key)
    # Label empty questions so the chat log stays readable.  (The model
    # call above already handled the empty prompt itself.)
    if len(input) == 0:
        input = "Speech from the video."
    # The previous revision duplicated this append in identical if/else
    # branches; a single append is equivalent.
    history.append((input, output_text))
    return history, history
# --- Gradio UI wiring -------------------------------------------------------
# Layout: left column takes the audio upload, API key, and an upload/start
# button; right column holds the chatbot transcript and the question box.
block = gr.Blocks(theme=gr.themes.Glass(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask and Learn about an Audio</center></h1> """)
    with gr.Row():
        with gr.Column(scale=0.5):
            aud_input = gr.Audio(type="filepath", label="Upload Audio")
            api_input = gr.Textbox(label="Enter Api-key")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            # Holds the (question, answer) history between callbacks.
            state = gr.State()
    # Both the button click and pressing Enter in the textbox trigger the
    # same chat turn; outputs update the visible chatbot and the state.
    upload_button.click(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    # Clear the textbox after submission.
    message.submit(lambda: None, None, message, queue=False)
# NOTE: the source as scraped ended this line with a stray " |" (page
# residue), which was a syntax error; it is removed here.
block.launch(share=True)