import json
import os

import gradio as gr
import requests

# Base URL of the remote inference server hosting the LTU model.
API_BASE_URL = 'http://sls-titan-9.csail.mit.edu:8080'

# Client-side upload limit: 30 MB.
MAX_AUDIO_BYTES = 30 * 1024 * 1024


def is_file_larger_than_30mb(file_path):
    """Return True if *file_path* exists and is larger than 30 MB.

    Any OS-level failure (missing file, no permission, ...) is treated as
    "not too large" so the subsequent upload attempt surfaces the real error.
    """
    try:
        return os.path.getsize(file_path) > MAX_AUDIO_BYTES
    except OSError:
        # FileNotFoundError and PermissionError are both OSError subclasses,
        # so one handler covers the original three branches.
        return False


def upload_audio(audio_path):
    """Upload the audio file at *audio_path* to the inference server.

    Returns:
        The server-side path string on success, the sentinel 'size' when the
        file exceeds 30 MB, or None on any failure (missing file, network
        error, non-200 response, unparsable response body).
    """
    try:
        if is_file_larger_than_30mb(audio_path):
            return 'size'
        with open(audio_path, 'rb') as audio_file:
            response = requests.post(f'{API_BASE_URL}/upload/',
                                     files={'audio_file': audio_file})
        if response.status_code == 200:
            return response.json()["path"]
        return None  # explicit: a non-200 status means the upload failed
    except (OSError, requests.RequestException, ValueError):
        # Best-effort contract: the caller maps None to a user-facing
        # "please upload" message, so swallow only the expected I/O and
        # parsing errors rather than everything (the original bare except
        # also trapped KeyboardInterrupt/SystemExit).
        return None


def predict(audio_path, question):
    """Answer *question* about the audio at *audio_path* via the LTU server.

    Returns the model's answer string, or a user-facing error message when
    the upload fails, the file is too large, or the question is empty.
    """
    upload_status = upload_audio(audio_path)
    if upload_status is None:
        return 'Please upload an audio file.'
    if upload_status == 'size':
        return 'This demo does not support audio file size larger than 30MB.'
    if question == '':
        return 'Please ask a question.'
    print(audio_path, question)
    response = requests.put(f'{API_BASE_URL}/items/0',
                            json={'audio_path': audio_path,
                                  'question': question})
    answer = json.loads(response.content)
    return answer['output']


if __name__ == '__main__':
    # NOTE(review): `link`, `paper_link` and `sample_audio_link` are defined
    # but never interpolated into the description — presumably they were
    # meant to become markdown links; left as-is to preserve behavior.
    link = "https://github.com/YuanGongND/ltu"
    text = "[Github]"
    paper_link = "https://arxiv.org/pdf/2305.10790.pdf"
    paper_text = "[Paper]"
    sample_audio_link = "https://drive.google.com/drive/folders/17yeBevX0LIS1ugt0DZDOoJolwxvncMja?usp=sharing"
    sample_audio_text = "[sample audios from AudioSet evaluation set]"

    demo = gr.Interface(
        fn=predict,
        inputs=[gr.Audio(type="filepath"),
                gr.Textbox(value='What can be inferred from the audio? Why?',
                           label='Edit the textbox to ask your own questions!')],
        outputs=[gr.Textbox(label="LTU Output")],
        cache_examples=True,
        title="Quick Demo of Listen, Think, and Understand (LTU)",
        description="LTU is a new audio model that bridges audio perception and advanced reasoning, it can answer any open-ended question about the given audio." + f"{paper_text} " + f"{text}\n" +
                    "LTU is authored by Yuan Gong, Hongyin Luo, Alexander H. Liu, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab).\n" +
                    "**Note LTU is not an ASR and has limited ability to recognize the speech content, it focuses on general audio perception and understanding.**\n" +
                    "Input an audio and ask quesions! Audio will be converted to 16kHz and padded or trim to 10 seconds. Don't have an audio sample on hand? Try some samples from AudioSet evaluation set: " + f"{sample_audio_text}\n" +
                    "**Research Demo, Not for Commercial Use (Due to license of LLaMA).**")
    demo.launch(debug=False, share=False)