"""Gradio demo that sends one audio clip and question to two LTU servers.

The same audio file and question are submitted to both backends and the two
answers are shown side by side.
"""
import json
import os

import gradio as gr
import requests

# Backend base URLs. The interface labels pair the titan-4 answer with
# "LTU-1 Output" and the titan-6 answer with "LTU-AS Output".
LTU_AS_SERVER = 'http://sls-titan-6.csail.mit.edu:8080'
LTU_1_SERVER = 'http://sls-titan-4.csail.mit.edu:8080'

# Upload size limit enforced client-side before contacting a server.
MAX_UPLOAD_BYTES = 30 * 1024 * 1024

# (connect, read) timeout: fail fast when a host is unreachable, but do not
# bound the read side because the time a server needs to answer is unknown.
REQUEST_TIMEOUT = (10, None)

# User-facing messages returned by predict() when a request cannot be made.
MSG_NO_AUDIO = 'Please upload an audio file.'
MSG_TOO_LARGE = 'This demo does not support audio file size larger than 30MB.'
MSG_NO_QUESTION = 'Please ask a question.'


def is_file_larger_than_30mb(file_path):
    """Return True when the file at *file_path* is strictly larger than 30 MiB.

    Any failure to read the size (missing file, permission problem, or a
    value that is not a usable path, such as None) is reported as False.
    """
    try:
        return os.path.getsize(file_path) > MAX_UPLOAD_BYTES
    except (OSError, TypeError, ValueError):
        return False


def _upload_to(server_url, audio_path):
    """Upload *audio_path* to ``<server_url>/upload/``.

    Returns:
        The string 'size' when the file exceeds the size limit (nothing is
        sent), the "path" value from the server's JSON reply on HTTP 200,
        or None when the file cannot be opened, the request fails, the
        status is not 200, or the reply lacks a "path" entry.
    """
    if is_file_larger_than_30mb(audio_path):
        return 'size'
    try:
        # The file must stay open while requests streams it.
        with open(audio_path, 'rb') as audio_file:
            response = requests.post(
                server_url + '/upload/',
                files={'audio_file': audio_file},
                timeout=REQUEST_TIMEOUT,
            )
        if response.status_code == 200:
            return response.json()["path"]
    except (OSError, TypeError, ValueError, KeyError, requests.RequestException):
        # Best effort: every upload failure is reported to the caller as None.
        return None
    return None


def upload_audio(audio_path):
    """Upload *audio_path* to the titan-6 server; see ``_upload_to`` for returns."""
    return _upload_to(LTU_AS_SERVER, audio_path)


def upload_audio_ltu1(audio_path):
    """Upload *audio_path* to the titan-4 server; see ``_upload_to`` for returns."""
    return _upload_to(LTU_1_SERVER, audio_path)


def _ask_backend(server_url, audio_path, question):
    """Upload the audio to one backend and ask it *question*.

    Returns:
        A pair ``(answer, error)``. On success ``answer`` is the "output"
        field of the server's JSON reply and ``error`` is None; otherwise
        ``answer`` is None and ``error`` is a user-facing message.

    Exceptions raised by the question request itself, or by decoding its
    reply, are not caught here and propagate to the caller.
    """
    upload_status = _upload_to(server_url, audio_path)
    if upload_status is None:
        return None, MSG_NO_AUDIO
    if upload_status == 'size':
        return None, MSG_TOO_LARGE
    if question == '':
        return None, MSG_NO_QUESTION

    print(audio_path, question)
    # NOTE(review): the request sends the local audio_path rather than the
    # path returned by the upload endpoint - confirm the servers expect this.
    response = requests.put(
        server_url + '/items/0',
        json={'audio_path': audio_path, 'question': question},
        timeout=REQUEST_TIMEOUT,
    )
    answer = json.loads(response.content)
    return answer['output'], None


def predict(audio_path, question):
    """Ask both backends the same question about the same audio file.

    Args:
        audio_path: Local path of the audio file (None when nothing was
            uploaded in the UI).
        question: Question text typed by the user.

    Returns:
        A 2-tuple ``(titan-4 answer, titan-6 answer)`` matching the two
        output components of the interface. When a request cannot be made
        the same error message is returned for both outputs, so the return
        value always has one entry per output component.
    """
    # titan-6 is queried first, then titan-4.
    answer_ltu_as, error = _ask_backend(LTU_AS_SERVER, audio_path, question)
    if error is not None:
        return error, error

    answer_ltu_1, error = _ask_backend(LTU_1_SERVER, audio_path, question)
    if error is not None:
        return error, error

    return answer_ltu_1, answer_ltu_as


if __name__ == '__main__':
    # Link metadata; not referenced by the interface description below.
    link = "https://github.com/YuanGongND/ltu"
    text = "[Github]"
    paper_link = "https://arxiv.org/pdf/2309.14405.pdf"
    paper_text = "[ASRU Paper]"
    sample_audio_link = "https://drive.google.com/drive/folders/17yeBevX0LIS1ugt0DZDOoJolwxvncMja?usp=sharing"
    sample_audio_text = "[sample audios from AudioSet evaluation set]"

    demo = gr.Interface(
        fn=predict,
        inputs=[
            gr.Audio(type="filepath"),
            gr.Textbox(
                value='What can be inferred from the spoken text and sounds? Why?',
                label='Edit the textbox to ask your own questions!',
            ),
        ],
        outputs=[
            gr.Textbox(label="LTU-1 Output"),
            gr.Textbox(label="LTU-AS Output"),
        ],
        cache_examples=True,
        title="Compare LTU-1 and LTU-AS",
        description="LTU-AS an improved version of LTU-1. LTU-AS is stronger in spoken text understanding and music understanding.\n"
                    + "This demo compares LTU-1 and LTU-AS.\n"
                    + "**Research Demo, Not for Commercial Use (Due to license of LLaMA).**",
    )
    demo.launch(debug=False, share=False)