Spaces:

yuangongfdu
/

ltu-2

Running

App Files Files Community

ltu-2 / app.py

yuangongfdu

Update app.py

a626e80 9 months ago

raw history blame

No virus

2.99 kB

	import json
	import gradio as gr
	import requests
	import os

	def is_file_larger_than_30mb(file_path):
	    """Report whether the file at ``file_path`` is bigger than 30MB.

	    Args:
	        file_path: Path handed to ``os.path.getsize``.

	    Returns:
	        ``True`` when the size in bytes is strictly greater than
	        ``30 * 1024 * 1024``; ``False`` otherwise, including every case in
	        which the size cannot be read (missing file, permission problem, or
	        any other exception such as an unusable path value).
	    """
	    limit_bytes = 30 * 1024 * 1024
	    try:
	        return os.path.getsize(file_path) > limit_bytes
	    except Exception:
	        # FileNotFoundError and PermissionError are Exception subclasses, so
	        # this single handler treats every failure the same way: "not large".
	        return False

	def upload_audio(audio_path):
	    """Upload a local audio file to the remote inference server.

	    Args:
	        audio_path: Filesystem path of the audio file to send.

	    Returns:
	        ``'size'`` when the file is larger than 30MB (nothing is uploaded),
	        the ``"path"`` field of the server's JSON reply when the server
	        answers with HTTP 200, and ``None`` when the file cannot be opened,
	        the request fails, the reply cannot be parsed, or the server answers
	        with any other status code.
	    """
	    # The size helper never raises, so it does not need to sit in the try.
	    if is_file_larger_than_30mb(audio_path):
	        return 'size'
	    try:
	        with open(audio_path, 'rb') as audio_file:
	            # NOTE(review): no timeout is passed to requests.post, so a
	            # stalled server blocks this call - consider adding one.
	            response = requests.post(
	                'http://sls-titan-6.csail.mit.edu:8080/upload/',
	                files={'audio_file': audio_file},
	            )
	        if response.status_code == 200:
	            return response.json()["path"]
	    except Exception:
	        # Deliberately best-effort: any ordinary failure is reported as None.
	        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
	        # SystemExit propagate so the process can still be stopped.
	        return None
	    # Non-200 reply: signal failure explicitly.
	    return None

	def predict(audio_path, question):
	    """Send an audio file and a question to the remote server and return its answer.

	    Args:
	        audio_path: Local path of the audio file to analyse.
	        question: Text of the question to ask about the audio.

	    Returns:
	        The ``'output'`` field of the server's JSON reply, or a fixed
	        message when the upload failed, the file is larger than 30MB, or
	        the question is an empty string.
	    """
	    upload_status = upload_audio(audio_path)

	    # Guard clauses, checked in this order: upload failure, size, question.
	    if upload_status is None:
	        return 'Please upload an audio file.'
	    if upload_status == 'size':
	        return 'This demo does not support audio file size larger than 30MB.'
	    if question == '':
	        return 'Please ask a question.'

	    print(audio_path, question)

	    # NOTE(review): the request carries the local ``audio_path`` rather than
	    # the path returned by upload_audio - confirm this is what the server
	    # expects.
	    payload = {'audio_path': audio_path, 'question': question}
	    response = requests.put('http://sls-titan-6.csail.mit.edu:8080/items/0', json=payload)

	    reply = json.loads(response.content)
	    return reply['output']

	if __name__ == '__main__':
	    # Link targets and labels used in the interface description.
	    link = "https://github.com/YuanGongND/ltu"
	    text = "[Github]"
	    paper_link = "https://arxiv.org/pdf/2309.14405.pdf"
	    paper_text = "[ASRU Paper]"
	    # The sample-audio link and label are defined but not referenced below.
	    sample_audio_link = "https://drive.google.com/drive/folders/17yeBevX0LIS1ugt0DZDOoJolwxvncMja?usp=sharing"
	    sample_audio_text = "[sample audios from AudioSet evaluation set]"

	    # HTML description shown in the interface, assembled piece by piece.
	    description_html = "".join([
	        "LTU-AS an improved version of LTU. LTU-AS is stronger in spoken text understanding and music understanding. ",
	        f"<a href='{paper_link}'>{paper_text}</a> <br>",
	        "LTU-AS is authored by Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>",
	        "Input should be wav file sampled at 16kHz. This demo trims input audio to 10 seconds. <br>",
	        "Code of LTU-AS will be available soon at ",
	        f"<a href='{link}'>{text}</a> <br>",
	        "Research Demo, Not for Commercial Use (Due to license of LLaMA).",
	    ])

	    # Input and output components, created in the order they are displayed.
	    audio_input = gr.Audio(type="filepath")
	    question_input = gr.Textbox(
	        value='What can be inferred from the spoken text and sounds? Why?',
	        label='Edit the textbox to ask your own questions!',
	    )
	    answer_output = gr.Textbox(label="LTU-AS Output")

	    demo = gr.Interface(
	        fn=predict,
	        inputs=[audio_input, question_input],
	        outputs=[answer_output],
	        cache_examples=True,
	        title="Demo of LTU-AS",
	        description=description_html,
	    )
	    demo.launch(debug=False, share=False)