File size: 3,992 Bytes
1c66a80
 
 
 
 
ed0abb5
f67682f
 
678aa20
 
1c66a80
5a85225
bb456ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3854
 
246aa34
5a85225
12d3854
 
590d051
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3854
 
1c66a80
08f3107
1c66a80
08f3107
1c66a80
cefb089
e103d16
cefb089
1c66a80
cefb089
1c66a80
 
 
678aa20
d75bbb9
1c66a80
2a7a0fd
1c66a80
 
32af635
e687694
1c66a80
 
 
 
 
 
246aa34
 
1c66a80
b91edc3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests
import openai
from openai import OpenAI

prompt = "Type and press Enter"


def record_text(audio_file,api_key):
    client = OpenAI(api_key = api_key)
    input_file = audio_file
    output_file = "converted_sound.mp3"
    sound = AudioSegment.from_wav(input_file) 
    sound.export(output_file, format="mp3")
    audio_file = "converted_sound.mp3"
    audio_file = open(audio_file, "rb") 
    transcript = client.audio.transcriptions.create(
        model="whisper-1", 
        file=audio_file,
        response_format="text"
        )
    return transcript
    # return(str(path.getsize(audio_file)/1000000)+'mb')
    # sound = audio_file
    # sound_type = sound.split(".")
    # if sound_type[-1] == 'mp3':
    #     input_file = sound
    #     output_file = "con_sound.wav"
        
    #     # convert mp3 file to wav file 
    #     sound = AudioSegment.from_mp3(input_file) 
    #     sound.export(output_file, format="wav")
    #     sound = "con_sound.wav"
    
    # MyText = ""
    # with sr.AudioFile(sound) as source:
    #     r.adjust_for_ambient_noise(source)
    #     print("Converting audio file to text..")
    #     audio2 = r.record(source, duration=None)  # Use record instead of listen
        
    #     MyText = r.recognize_google(audio2, language="en-US", key=None, show_all=False)
    #     MyText = MyText.lower()
    # return (MyText)


def api_calling(audio_file, prompt, api_key):
    audio_text = record_text(audio_file,api_key)
    if len(prompt) == 0:
        prompt = "Apply proper punctuations, upper case and lower case to the provided text."
        return audio_text
    else:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        payload = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "text",
                            "text": audio_text
                        }
                    ]
                }
            ],
            "max_tokens": 1000
        }
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        audio_text_res = response.json()
        return audio_text_res["choices"][0]["message"]["content"]



def message_and_history(audio_text,input, history, api_key):
    history = history or []
    output_text = api_calling(audio_text,input,api_key)
    
    if len(input) == 0:
        input = "Speech from the video."
        history.append((input, output_text))
    else:
        history.append((input, output_text))
    
    return history, history


block = gr.Blocks(theme=gr.themes.Monochrome(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask & Learn about an Audio</center></h1> """)
    with gr.Row():
        with gr.Column(scale=0.5):
            aud_input = gr.Audio(type="filepath", label="Upload .mp3 or .wav file", sources="upload")
            api_input = gr.Textbox(label="Enter Api-key")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            state = gr.State()
            
    upload_button.click(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input,message, state, api_input], outputs=[chatbot, state])
    message.submit(lambda: None, None, message, queue=False)
block.launch()