yuangongfdu committed
Commit
0674a08
1 Parent(s): bfe831e

Update app.py

Files changed (1):
app.py +5 -6
app.py CHANGED
@@ -45,7 +45,7 @@ def predict(audio_path, question):
 if __name__ == '__main__':
     link = "https://github.com/YuanGongND/ltu"
     text = "[Github]"
-    paper_link = "https://arxiv.org/pdf/2305.10790.pdf"
+    paper_link = "https://www.researchgate.net/publication/374153208_Joint_Audio_and_Speech_Understanding"
     paper_text = "[Paper]"
     sample_audio_link = "https://drive.google.com/drive/folders/17yeBevX0LIS1ugt0DZDOoJolwxvncMja?usp=sharing"
     sample_audio_text = "[sample audios from AudioSet evaluation set]"
@@ -53,12 +53,11 @@ if __name__ == '__main__':
         inputs=[gr.Audio(type="filepath"),
                 gr.Textbox(value='What can be inferred from the spoken text and sounds? Why?',
                            label='Edit the textbox to ask your own questions!')],
-        outputs=[gr.Textbox(label="LTU Output")],
+        outputs=[gr.Textbox(label="LTU-AS Output")],
         cache_examples=True,
-        title="Demo of LTU-2 Beta",
-        description="LTU-2 an improved version of LTU. LTU-2 is stronger in spoken text understanding and music understanding. <br>" +
-                    "LTU is authored by Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>" +
-                    "**Please note that the model is under construction and may be buggy. It is trained with some new techniques that are not described in LTU paper. I.e., using method described in LTU paper cannot reproduce this model.**<br>" +
+        title="Demo of LTU-AS",
+        description="LTU-AS an improved version of LTU. LTU-AS is stronger in spoken text understanding and music understanding. " + f"<a href='{paper_link}'>{paper_text}</a> <br>" +
+                    "LTU-AS is authored by Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>" +
                     "Input should be wav file sampled at 16kHz. This demo trim input audio to 10 seconds. <br>"
                     "**Research Demo, No Commercial Use (Due to license of LLaMA).**")
     demo.launch(debug=False, share=False)
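For orientation, the keyword arguments touched in the second hunk are passed to a Gradio Interface whose opening call sits above the diff context. The sketch below shows roughly how the changed pieces fit together, with a placeholder predict stub and without cache_examples (the real model call and example list are not visible in this hunk); the exact wiring in app.py is an assumption.

```python
# Minimal sketch of the Interface wiring implied by this hunk.
# The enclosing gr.Interface(...) call and the real predict() body live outside
# the diff context, so the stub and the argument layout here are assumptions.
import gradio as gr

def predict(audio_path, question):
    # Placeholder: the real LTU-AS model call is defined earlier in app.py.
    return f"(model answer to: {question})"

paper_link = "https://www.researchgate.net/publication/374153208_Joint_Audio_and_Speech_Understanding"
paper_text = "[Paper]"

demo = gr.Interface(
    fn=predict,
    inputs=[gr.Audio(type="filepath"),
            gr.Textbox(value='What can be inferred from the spoken text and sounds? Why?',
                       label='Edit the textbox to ask your own questions!')],
    outputs=[gr.Textbox(label="LTU-AS Output")],
    title="Demo of LTU-AS",
    description="LTU-AS is an improved version of LTU. " + f"<a href='{paper_link}'>{paper_text}</a>",
)
demo.launch(debug=False, share=False)
```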