Spaces:

Vihang28
/

Audio_Recognition_QnA

Sleeping

File size: 3,625 Bytes

1c66a80
 
 
 
 
ed0abb5
f67682f
 
678aa20
 
1c66a80
5a85225
93c16a2
f888789
97f56b5
 
ae21500
 
97f56b5
 
a855368
fcabb84
 
 
 
 
1f38dc2
fcabb84
 
 
 
1c66a80
fcabb84
 
 
 
 
c52635d
fcabb84
 
 
12d3854
 
246aa34
5a85225
12d3854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08f3107
12d3854
 
 
 
c1347d6
12d3854
 
 
d66b86d
12d3854
 
1c66a80
08f3107
1c66a80
08f3107
1c66a80
cefb089
e103d16
cefb089
1c66a80
cefb089
1c66a80
 
 
678aa20
1c66a80
 
 
 
 
 
246aa34
1c66a80
 
 
 
 
 
 
246aa34
 
1c66a80
b91edc3

import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests
import openai
from openai import OpenAI

# Placeholder text for the chat input textbox (passed as `placeholder=` when the UI is built).
prompt = "Type and press Enter"


def record_text(audio_file,api_key):
    """Transcribe an audio file with OpenAI's ``whisper-1`` model.

    Args:
        audio_file: Path of the audio file to send for transcription.
        api_key: OpenAI API key used to construct the client.

    Returns:
        Whatever the transcription endpoint returns for
        ``response_format="text"``.
    """
    client = OpenAI(api_key=api_key)
    # The context manager guarantees the file handle is closed even when the
    # API call raises; previously the handle was opened and never closed.
    with open(audio_file, "rb") as audio_handle:
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_handle,
            response_format="text",
        )
    
    # sound = audio_file
    # sound_type = sound.split(".")
    # if sound_type[-1] == 'mp3':
    #     input_file = sound
    #     output_file = "con_sound.wav"
        
    #     # convert mp3 file to wav file 
    #     sound = AudioSegment.from_mp3(input_file) 
    #     sound.export(output_file, format="wav")
    #     sound = "con_sound.wav"
    
    # MyText = ""
    # with sr.AudioFile(sound) as source:
    #     r.adjust_for_ambient_noise(source)
    #     print("Converting audio file to text..")
    #     audio2 = r.record(source, duration=None)  # Use record instead of listen
        
    #     MyText = r.recognize_google(audio2, language="en-US", key=None, show_all=False)
    #     MyText = MyText.lower()
    # return (MyText)


def api_calling(audio_file, prompt, api_key):
    """Transcribe an audio file and ask a chat model about the transcript.

    The transcript is obtained through ``record_text`` and sent, together with
    ``prompt``, as a single user message to the chat completions endpoint.

    Args:
        audio_file: Path of the audio file to transcribe.
        prompt: Instruction or question to apply to the transcript. When empty
            or ``None``, a default punctuation/casing instruction is used.
        api_key: OpenAI API key, used for both the transcription and the chat
            request.

    Returns:
        The ``content`` of the first choice in the chat completion response.

    Raises:
        RuntimeError: If the endpoint answers with a non-JSON body or with a
            payload that contains no choices (for example an invalid API key
            or exhausted quota).
    """
    audio_text = record_text(audio_file, api_key)
    # `not prompt` also covers None, where len(prompt) would raise TypeError.
    if not prompt:
        prompt = "Apply proper punctuations, upper case and lower case to the provided text."

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "text",
                        "text": audio_text
                    }
                ]
            }
        ],
        "max_tokens": 1500
    }
    # Without a timeout requests waits forever on a stalled connection, which
    # would block the UI worker. (connect timeout, read timeout) in seconds.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=(10, 120),
    )

    try:
        audio_text_res = response.json()
    except ValueError as err:
        raise RuntimeError(
            f"OpenAI API returned a non-JSON response (HTTP {response.status_code})"
        ) from err

    # Error payloads carry an "error" object instead of "choices"; surface the
    # API's own message rather than failing with an opaque KeyError.
    choices = audio_text_res.get("choices") if isinstance(audio_text_res, dict) else None
    if not choices:
        error = audio_text_res.get("error") if isinstance(audio_text_res, dict) else None
        detail = error.get("message") if isinstance(error, dict) else None
        raise RuntimeError(
            f"OpenAI chat completion failed (HTTP {response.status_code}): "
            f"{detail or audio_text_res}"
        )
    return choices[0]["message"]["content"]



def message_and_history(audio_text,input, history, api_key):
    """Run one chat turn and append it to the conversation history.

    Args:
        audio_text: Forwarded unchanged as the first argument of
            ``api_calling``.
        input: The user's message; forwarded to ``api_calling`` as the prompt.
        history: List of ``(user_text, reply)`` tuples from earlier turns, or a
            falsy value when the conversation is just starting.
        api_key: OpenAI API key forwarded to ``api_calling``.

    Returns:
        The updated history twice, as a 2-tuple.
    """
    chat_log = history if history else []
    reply = api_calling(audio_text, input, api_key)

    # An empty message is recorded under a fixed label instead of a blank
    # entry; the model call above still received the original empty input.
    shown_message = "Speech from the video." if len(input) == 0 else input
    chat_log.append((shown_message, reply))

    return chat_log, chat_log


# Build the Gradio interface: an upload/API-key column on the left and a chat
# column on the right, both wired to `message_and_history`.
block = gr.Blocks(theme=gr.themes.Glass(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask and Learn about an Audio</center></h1> """)
    with gr.Row():
        # Integer scales 1:2 keep the original 0.5:1 width ratio; Gradio
        # expects `scale` to be an integer.
        with gr.Column(scale=1):
            aud_input = gr.Audio(type="filepath", label="Upload Audio")
            # Mask the key so the secret is not shown in clear text on screen.
            api_input = gr.Textbox(label="Enter Api-key", type="password")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            # Holds the conversation history between events.
            state = gr.State()

    # Both the button and pressing Enter in the message box run a chat turn.
    upload_button.click(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    # Clear the message box after it has been submitted.
    message.submit(lambda: None, None, message, queue=False)
block.launch()