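"""Gradio demo: transcribe an uploaded audio file with OpenAI's Whisper API,
then ask questions about the transcript through the Chat Completions API.

Note: this script is written against the pre-1.0 `openai` SDK
(`openai.Audio.transcribe`) and Gradio 3.x (tuple-style `gr.Chatbot` history);
newer versions of either library need small adjustments.
"""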
import gradio as gr
import openai
import requests

prompt = "Type and press Enter"  # placeholder text shown in the chat textbox


def record_text(audio_file, api_key):
    """Transcribe an audio file with OpenAI's Whisper API and return the text."""
    openai.api_key = api_key
    with open(audio_file, "rb") as f:  # context manager so the handle is closed
        transcript = openai.Audio.transcribe("whisper-1", f)
    return transcript["text"]
    # Earlier offline path (kept for reference): convert an mp3 to wav with
    # pydub, then transcribe with SpeechRecognition's recognize_google.
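
# A minimal equivalent on the openai>=1.0 SDK (a sketch, not used by this app):
#
#     from openai import OpenAI
#     client = OpenAI(api_key=api_key)
#     with open(audio_file, "rb") as f:
#         transcript = client.audio.transcriptions.create(model="whisper-1", file=f)
#     text = transcript.text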


def api_calling(audio_file, prompt, api_key):
    """Transcribe the audio, then answer the prompt about it via gpt-3.5-turbo."""
    audio_text = record_text(audio_file, api_key)
    if len(prompt) == 0:
        prompt = "Apply proper punctuation, upper case and lower case to the provided text."

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                # Plain string content is the safe format for gpt-3.5-turbo, so
                # the prompt and the transcript are merged into one message.
                "content": f"{prompt}\n\n{audio_text}",
            }
        ],
        "max_tokens": 1000,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
    )
    response.raise_for_status()  # surface auth/HTTP errors instead of a KeyError below
    return response.json()["choices"][0]["message"]["content"]
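
# The same request through the pre-1.0 openai SDK would look roughly like:
#
#     completion = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=[{"role": "user", "content": f"{prompt}\n\n{audio_text}"}],
#         max_tokens=1000,
#     )
#     text = completion["choices"][0]["message"]["content"]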



def message_and_history(audio_file, user_message, history, api_key):
    """Run one chat turn and append the (question, answer) pair to the history."""
    history = history or []
    output_text = api_calling(audio_file, user_message, api_key)
    if len(user_message) == 0:
        user_message = "Speech from the video."
    history.append((user_message, output_text))
    return history, history
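
# The state is a list of (user, assistant) tuples, the history format that
# Gradio 3.x's gr.Chatbot renders, e.g.:
#
#     [("Summarize the audio.", "The speaker discusses ..."),
#      ("Who is speaking?", "It sounds like a single narrator ...")]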


# Build the UI: audio upload and API key on the left, the chat panel on the right.
block = gr.Blocks(theme=gr.themes.Glass(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask and Learn About an Audio File</center></h1>""")
    with gr.Row():
        with gr.Column(scale=0.5):
            aud_input = gr.Audio(type="filepath", label="Upload Audio")
            api_input = gr.Textbox(label="Enter API key", type="password")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            state = gr.State()

    upload_button.click(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(lambda: None, None, message, queue=False)  # clear the textbox after each send
block.launch(share=True)  # share=True also serves a temporary public link