yuangongfdu committed on
Commit
7cabb32
1 Parent(s): beed28a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import gradio as gr
3
+ import requests
4
+ import os
5
+
6
def is_file_larger_than_30mb(file_path, limit_bytes=30 * 1024 * 1024):
    """Return True when the file at *file_path* is larger than *limit_bytes*.

    The default limit is 30 MiB, matching the function name and the
    "larger than 30MB" message shown to users (the previous threshold of
    300 MiB contradicted both).

    Args:
        file_path: Path of the file to inspect.
        limit_bytes: Size threshold in bytes; the file must be strictly
            larger than this for the function to return True.

    Returns:
        True if the file size exceeds ``limit_bytes``. False if it does not,
        or if the size cannot be determined (missing file, permission
        problem, or an invalid path such as ``None``).
    """
    try:
        return os.path.getsize(file_path) > limit_bytes
    except (OSError, TypeError, ValueError):
        # OSError covers FileNotFoundError / PermissionError; TypeError and
        # ValueError cover invalid path objects. An unreadable file is
        # reported as "not too large" so the caller can fail later when it
        # actually tries to open it.
        return False
16
+
17
def _upload_audio_to(upload_url, audio_path):
    """POST the audio file at *audio_path* to *upload_url*.

    Shared implementation for the per-model upload functions, which differ
    only in the server they talk to.

    Args:
        upload_url: Full URL of the server's upload endpoint.
        audio_path: Local path of the audio file to send.

    Returns:
        ``'size'`` if the file is rejected by ``is_file_larger_than_30mb``;
        the ``"path"`` field of the server's JSON reply when the server
        answers with HTTP 200; ``None`` for any other status code or when
        anything goes wrong (unreadable file, network error, bad reply).
    """
    try:
        if is_file_larger_than_30mb(audio_path):
            return 'size'
        with open(audio_path, 'rb') as audio_file:
            response = requests.post(upload_url, files={'audio_file': audio_file})
        if response.status_code == 200:
            return response.json()["path"]
        return None
    except Exception:
        # Deliberately best-effort: every failure is reported to the caller
        # as None. Unlike a bare ``except:``, this does not swallow
        # KeyboardInterrupt or SystemExit.
        return None


def upload_audio(audio_path):
    """Upload *audio_path* to the sls-sm-14 server.

    ``predict`` uses this function for the '7B (Default)' model.

    Returns:
        ``'size'`` if the file is too large, the server-side path string on
        success, or ``None`` on any failure.
    """
    return _upload_audio_to('http://sls-sm-14.csail.mit.edu:8080/upload/', audio_path)


def upload_audio_13b(audio_path):
    """Upload *audio_path* to the sls-sm-15 server.

    ``predict`` uses this function for the '13B (Beta)' model.

    Returns:
        ``'size'`` if the file is too large, the server-side path string on
        success, or ``None`` on any failure.
    """
    return _upload_audio_to('http://sls-sm-15.csail.mit.edu:8080/upload/', audio_path)
40
+
41
def predict(audio_path_m, audio_path_t, question, model):
    """Answer *question* about one audio recording using the selected model.

    Args:
        audio_path_m: File path of the first audio input (the microphone
            component in the interface), or None.
        audio_path_t: File path of the second audio input, or None.
        question: The question to ask about the audio.
        model: Either '7B (Default)' or '13B (Beta)'.

    Returns:
        A single string: the model's answer, or a message explaining why
        the request could not be processed. Always one value, because the
        interface declares exactly one output component. Returns None for
        an unrecognized *model*, as before.

    Raises:
        Whatever ``requests.put`` / ``json.loads`` raise if the inference
        request fails, and KeyError if the reply has no 'output' field.
    """
    # Exactly one of the two audio inputs must be provided.
    if (audio_path_m is None) == (audio_path_t is None):
        return "Please upload and only upload one recording, either upload the audio file or record using microphone."

    # Validate the question before uploading, so no upload is wasted on a
    # request that would be rejected anyway.
    if not question:
        return 'Please ask a question.'

    audio_path = audio_path_m or audio_path_t

    # model label -> (upload function, inference endpoint)
    backends = {
        '7B (Default)': (upload_audio, 'http://sls-sm-14.csail.mit.edu:8080/items/0'),
        '13B (Beta)': (upload_audio_13b, 'http://sls-sm-15.csail.mit.edu:8080/items/0'),
    }
    backend = backends.get(model)
    if backend is None:
        return None
    upload, inference_url = backend

    upload_status = upload(audio_path)
    if upload_status is None:
        return 'Please upload an audio file.'
    if upload_status == 'size':
        return 'This demo does not support audio file size larger than 30MB.'

    print(audio_path, question)
    # NOTE(review): the local audio_path is sent here, not the path returned
    # by the upload call; presumably the server resolves it itself - confirm.
    response = requests.put(inference_url, json={
        'audio_path': audio_path, 'question': question
    })
    answer = json.loads(response.content)
    return answer['output']
77
+
78
if __name__ == '__main__':
    # Links rendered as anchors inside the interface description.
    link = "https://github.com/YuanGongND/ltu"
    text = "[Github]"
    paper_link = "https://arxiv.org/pdf/2309.14405.pdf"
    paper_text = "[ASRU Paper]"
    # The sample-audio link and text are defined but not referenced below.
    sample_audio_link = "https://drive.google.com/drive/folders/17yeBevX0LIS1ugt0DZDOoJolwxvncMja?usp=sharing"
    sample_audio_text = "[sample audios from AudioSet evaluation set]"

    # Input components, in the order of predict's parameters:
    # microphone audio, second audio input, question, model choice.
    input_components = [
        gr.Audio(
            type="filepath",
            source='microphone',
            label='Please either upload an audio file or record using the microphone.',
            show_label=True,
        ),
        gr.Audio(type="filepath"),
        gr.Textbox(
            value='What can be inferred from the spoken text and sounds? Why?',
            label='Edit the textbox to ask your own questions!',
        ),
        gr.Radio(
            ["7B (Default)", "13B (Beta)"],
            value='7B (Default)',
            label="LLM size",
            info="All experiments in the ASRU paper are 7B LLM.",
        ),
    ]

    # The description is assembled from fragments; the pieces are joined
    # without a separator so the result equals the concatenated string.
    description_parts = [
        "LTU-AS an improved version of LTU. LTU-AS is stronger in spoken text understanding and music understanding. ",
        f"<a href='{paper_link}'>{paper_text}</a> <br>",
        "LTU-AS is authored by Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>",
        "Input should be wav file sampled at 16kHz. This demo trims input audio to 10 seconds. <br>",
        "Code of LTU-AS will be available soon at ",
        f"<a href='{link}'>{text}</a> <br>",
        "**Research Demo, Not for Commercial Use (Due to license of LLaMA).**",
    ]

    demo = gr.Interface(
        fn=predict,
        inputs=input_components,
        outputs=[gr.Textbox(label="LTU-AS-Output")],
        cache_examples=True,
        title="Demo of LTU-AS",
        description="".join(description_parts),
    )
    demo.launch(debug=False, share=False)