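# Gradio app: transcribe an uploaded audio file with the Google Web Speech API,
# then chat about the transcript through the OpenAI chat completions API.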
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests

PLACEHOLDER = "Type and press Enter"  # placeholder text for the chat input box


def record_text(audio_file):
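    """Transcribe the audio file at the given path to lowercase text."""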
    r = sr.Recognizer()
    
    sound = audio_file
    # speech_recognition reads WAV/AIFF/FLAC, so convert MP3 input first.
    if path.splitext(sound)[1].lower() == ".mp3":
        input_file = sound
        output_file = "con_sound.wav"

        # convert mp3 file to wav file
        AudioSegment.from_mp3(input_file).export(output_file, format="wav")
        sound = output_file
    
    MyText = ""
    with sr.AudioFile(sound) as source:
        r.adjust_for_ambient_noise(source)
        print("Converting audio file to text..")
        audio2 = r.record(source, duration=None)  # record() reads the whole file; listen() is for live microphones

        # Default (key=None) uses the free Google Web Speech API quota.
        MyText = r.recognize_google(audio2, language="en-US")
        MyText = MyText.lower()
    return MyText


def api_calling(audio_file, prompt, api_key):
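    """Transcribe the audio, then send the prompt and transcript to the OpenAI chat API."""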
    audio_text = record_text(audio_file)
    if len(prompt) == 0:
        prompt = "Apply proper punctuation and capitalization to the provided text."
    
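    # Call the chat completions endpoint directly over HTTP with the user-supplied key.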
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                # gpt-3.5-turbo expects plain string content, so send the
                # instruction and the transcript as one message.
                "content": f"{prompt}\n\n{audio_text}"
            }
        ],
        "max_tokens": 1000
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response.raise_for_status()  # surface API errors (bad key, quota) instead of a KeyError below
    audio_text_res = response.json()
    return audio_text_res["choices"][0]["message"]["content"]


def message_and_history(audio_file, user_message, history, api_key):
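    """Run one chat turn and append the (question, answer) pair to the chat history."""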
    history = history or []
    output_text = api_calling(audio_file, user_message, api_key)

    # Use a placeholder label when the user submits no question.
    if len(user_message) == 0:
        user_message = "Speech from the audio."
    history.append((user_message, output_text))
    
    return history, history


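# Gradio UI: audio upload and API key on the left, chat panel on the right.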
block = gr.Blocks(theme=gr.themes.Glass(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask and Learn about an Audio</center></h1> """)
    with gr.Row():
        with gr.Column(scale=0.5):
            aud_input = gr.Audio(type="filepath", label="Upload Audio")
            # audio_text = record_text(aud_input)
            api_input = gr.Textbox(label="Enter Api-key")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=PLACEHOLDER)
            state = gr.State()
            
    upload_button.click(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(lambda: None, None, message, queue=False)
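# share=True also exposes the app through a temporary public gradio.live URL.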
block.launch(share=True)