File size: 3,287 Bytes
7a2502d
 
 
bfe831e
 
 
 
 
 
 
 
 
 
 
 
7a2502d
 
 
bfe831e
 
 
7a2502d
 
 
 
 
 
 
 
 
 
 
bfe831e
 
7a2502d
 
 
 
 
 
9a72e52
 
 
 
 
 
 
 
 
7a2502d
 
 
 
a626e80
b02fdb5
7a2502d
 
 
 
 
 
9a72e52
7a2502d
0674a08
 
 
9a72e52
b02fdb5
 
7a2502d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
import gradio as gr
import requests
import os

def is_file_larger_than_30mb(file_path):
    """Report whether the file at ``file_path`` is strictly larger than 30 MB.

    Any error while reading the size (missing file, insufficient permissions,
    an unusable path value, ...) is treated as "not too large", so the
    function returns ``False`` instead of raising.
    """
    limit_in_bytes = 30 * 1024 * 1024
    try:
        return os.path.getsize(file_path) > limit_in_bytes
    except Exception:
        # Best-effort check: every failure mode maps to the same answer.
        return False

def upload_audio(audio_path):
    """Upload a local audio file to the inference server.

    Args:
        audio_path: Filesystem path of the audio file to send.

    Returns:
        * the ``"path"`` field of the server's JSON reply when the server
          answers with HTTP 200;
        * the string ``'size'`` when the file is larger than 30 MB (nothing
          is uploaded in that case);
        * ``None`` when the upload fails for any reason: unreadable or
          missing file, network error, non-200 status, or a reply that is
          not JSON / lacks the ``"path"`` key.
    """
    # The size helper never raises for ordinary errors, so it can sit outside
    # the try block.
    if is_file_larger_than_30mb(audio_path):
        return 'size'

    try:
        with open(audio_path, 'rb') as audio_file:
            response = requests.post('http://sls-titan-6.csail.mit.edu:8080/upload/', files={'audio_file': audio_file})
        if response.status_code == 200:
            return response.json()["path"]
    except Exception:
        # Deliberately best-effort: callers only distinguish success, 'size'
        # and None. `Exception` (rather than a bare `except:`) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return None

    # Non-200 response: report failure explicitly rather than by falling off
    # the end of the function.
    return None

def predict(audio_path, question):
    """Answer ``question`` about the audio at ``audio_path`` with both models.

    The audio file is uploaded first, then the same request is sent to the
    two inference servers in turn.

    Args:
        audio_path: Local path of the audio file to analyse.
        question: Free-text question about the audio.

    Returns:
        A 2-tuple ``(ans_str_7b, ans_str_13b)`` holding the ``'output'``
        field of each server's JSON reply. When the input is rejected, the
        same user-facing message is returned in both positions so that the
        result always carries two values, one per output textbox of the
        interface this function is wired to.
    """
    upload_status = upload_audio(audio_path)

    # Validation order is significant: it decides which message wins when
    # several problems are present at once.
    error_message = None
    if upload_status is None:
        error_message = 'Please upload an audio file.'
    elif upload_status == 'size':
        error_message = 'This demo does not support audio file size larger than 30MB.'
    elif not question:
        error_message = 'Please ask a question.'
    if error_message is not None:
        # Returning a bare string here would give the two-output interface
        # only one value, which Gradio rejects; duplicate the message.
        return error_message, error_message

    print(audio_path, question)

    # NOTE(review): the path returned by the upload is not forwarded; both
    # servers receive the local `audio_path`. Presumably they resolve it on
    # their side -- confirm against the server implementation.
    payload = {'audio_path': audio_path, 'question': question}

    response = requests.put('http://sls-titan-6.csail.mit.edu:8080/items/0', json=payload)
    ans_str_7b = json.loads(response.content)['output']

    response = requests.put('http://sls-titan-5.csail.mit.edu:8080/items/0', json=payload)
    ans_str_13b = json.loads(response.content)['output']

    return ans_str_7b, ans_str_13b

if __name__ == '__main__':
    # Hyperlinks and their anchor texts used in the page description.
    link = "https://github.com/YuanGongND/ltu"
    text = "[Github]"
    paper_link = "https://arxiv.org/pdf/2309.14405.pdf"
    paper_text = "[ASRU Paper]"
    sample_audio_link = "https://drive.google.com/drive/folders/17yeBevX0LIS1ugt0DZDOoJolwxvncMja?usp=sharing"
    sample_audio_text = "[sample audios from AudioSet evaluation set]"

    # Input widgets: the audio file (handed to `predict` as a file path) and
    # an editable question pre-filled with a default prompt.
    audio_input = gr.Audio(type="filepath")
    question_input = gr.Textbox(
        value='What can be inferred from the spoken text and sounds? Why?',
        label='Edit the textbox to ask your own questions!',
    )

    # One textbox per model answer, in the order `predict` returns them.
    answer_boxes = [
        gr.Textbox(label="LTU-AS-7B Output"),
        gr.Textbox(label="LTU-AS-13B Output"),
    ]

    # HTML description shown under the title, assembled piece by piece.
    description_html = "".join([
        "LTU-AS an improved version of LTU. LTU-AS is stronger in spoken text understanding and music understanding. ",
        f"<a href='{paper_link}'>{paper_text}</a> <br>",
        "LTU-AS is authored by Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>",
        "Input should be wav file sampled at 16kHz. This demo trims input audio to 10 seconds. <br>",
        "Code of LTU-AS will be available soon at ",
        f"<a href='{link}'>{text}</a> <br>",
        "**Research Demo, Not for Commercial Use (Due to license of LLaMA).**",
    ])

    demo = gr.Interface(
        fn=predict,
        inputs=[audio_input, question_input],
        outputs=answer_boxes,
        cache_examples=True,
        title="Demo of LTU-AS",
        description=description_html,
    )
    demo.launch(debug=False, share=False)