Spaces:

ZebangCheng
/

Emotion-LLaMA

Running on Zero

App Files Files Community

Emotion-LLaMA / app.py

ZebangCheng

zerogpu test

9cc1833 20 days ago

raw

history blame contribute delete

26.9 kB

	import spaces


	import argparse
	import os
	import random
	from collections import defaultdict
	import cv2
	import re
	import numpy as np
	from PIL import Image
	import torch
	import html
	import gradio as gr
	import torchvision.transforms as T
	import torch.backends.cudnn as cudnn
	from minigpt4.common.config import Config
	from minigpt4.common.registry import registry
	from minigpt4.conversation.conversation import Conversation, SeparatorStyle, Chat
	# imports modules for registration
	from minigpt4.datasets.builders import *
	from minigpt4.models import *
	from minigpt4.processors import *
	from minigpt4.runners import *
	from minigpt4.tasks import *

	import socket
	import os


	def parse_args():
	    """Parse the demo's command-line arguments.

	    Returns:
	        The parsed ``argparse.Namespace``. ``cfg_path`` defaults to
	        ``eval_configs/demo.yaml``; ``options`` is a list of override
	        strings, or ``None`` when ``--options`` is not given.
	    """
	    cli = argparse.ArgumentParser(description="Demo")
	    cli.add_argument(
	        "--cfg-path",
	        default='eval_configs/demo.yaml',
	        help="path to configuration file.",
	    )
	    options_help = (
	        "override some settings in the used config, the key-value pair "
	        "in xxx=yyy format will be merged into config file (deprecate), "
	        "change to --cfg-options instead."
	    )
	    cli.add_argument("--options", nargs="+", help=options_help)
	    return cli.parse_args()


	# Module-level start-up: runs once at import time, before the UI is built.

	# Fix the seeds of the Python, NumPy and torch RNGs.
	random.seed(42)
	np.random.seed(42)
	torch.manual_seed(42)

	# Turn off cuDNN autotuning and request deterministic kernels.
	cudnn.benchmark = False
	cudnn.deterministic = True

	print('Initializing Chat')
	# Command-line arguments are parsed at import time and handed to the project Config.
	args = parse_args()
	cfg = Config(args)

	# The device is hard-coded; there is no CPU fallback in this file.
	device = 'cuda'

	model_config = cfg.model_cfg

	print("model_config:", model_config)
	# Look up the model class by the configured architecture name and build it on `device`.
	model_cls = registry.get_model_class(model_config.arch)
	model = model_cls.from_config(model_config).to(device)
	# Divisor used by visualize_all_bbox_together to scale box coordinates to pixels.
	bounding_box_size = 100

	# Build the visual processor from the `train` entry of the feature_face_caption dataset config.
	vis_processor_cfg = cfg.datasets_cfg.feature_face_caption.vis_processor.train
	vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

	# Switch the model to evaluation mode.
	model = model.eval()

	# Conversation template: gradio_ask calls CONV_VISION.copy() to start a fresh
	# chat state (first message, or after the uploaded media is replaced).
	# The template has an empty system prompt, "<s>[INST] " / " [/INST]" as the
	# two role markers, and no separator text.
	CONV_VISION = Conversation(
	system="",
	roles=(r"<s>[INST] ", r" [/INST]"),
	messages=[],
	offset=2,
	sep_style=SeparatorStyle.SINGLE,
	sep="",
	)


	def extract_substrings(string):
	    """Return the text captured between each ``<p>`` and its closing ``}``.

	    Anything after the last ``}`` in ``string`` is discarded first, so an
	    unfinished trailing bracket is ignored. A ``}`` that is immediately
	    followed by ``<`` does not end a match, which keeps several boxes joined
	    by a tag inside one captured substring.

	    Args:
	        string: Text to scan.

	    Returns:
	        A list of the captured substrings (without the leading ``<p>`` and
	        the final ``}``); empty when nothing matches.
	    """
	    # first check if there is no-finished bracket
	    last_brace = string.rfind('}')
	    if last_brace >= 0:
	        string = string[:last_brace + 1]

	    return re.findall(r'<p>(.*?)\}(?!<)', string)


	def is_overlapping(rect1, rect2):
	    """Tell whether two axis-aligned rectangles intersect.

	    Args:
	        rect1: ``(x1, y1, x2, y2)`` of the first rectangle.
	        rect2: ``(x1, y1, x2, y2)`` of the second rectangle.

	    Returns:
	        ``True`` unless one rectangle lies strictly to one side of the
	        other; rectangles that only share an edge or corner count as
	        overlapping.
	    """
	    left_a, top_a, right_a, bottom_a = rect1
	    left_b, top_b, right_b, bottom_b = rect2
	    apart_horizontally = right_a < left_b or left_a > right_b
	    apart_vertically = bottom_a < top_b or top_a > bottom_b
	    return not (apart_horizontally or apart_vertically)


	def computeIoU(bbox1, bbox2):
	    """Compute the intersection-over-union of two boxes.

	    Widths and heights are measured inclusively (``x2 - x1 + 1``), i.e. the
	    coordinates are treated as pixel indices.

	    Args:
	        bbox1: ``(x1, y1, x2, y2)`` of the first box.
	        bbox2: ``(x1, y1, x2, y2)`` of the second box.

	    Returns:
	        The IoU as a float. Returns ``0.0`` when the union area is not
	        positive (degenerate or inverted boxes) instead of dividing by zero.
	    """
	    x1, y1, x2, y2 = bbox1
	    x3, y3, x4, y4 = bbox2

	    overlap_w = max(0, min(x2, x4) - max(x1, x3) + 1)
	    overlap_h = max(0, min(y2, y4) - max(y1, y3) + 1)
	    intersection_area = overlap_w * overlap_h

	    bbox1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
	    bbox2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
	    union_area = bbox1_area + bbox2_area - intersection_area

	    # Degenerate input (e.g. zero-area boxes) would otherwise raise ZeroDivisionError.
	    if union_area <= 0:
	        return 0.0
	    return intersection_area / union_area


	def save_tmp_img(visual_img):
	    """Save an image to a fresh ``/tmp/gradio*.jpg`` file and return its path.

	    The file name is reserved with ``tempfile.mkstemp`` so that every call
	    gets a unique path. The previous 5-digit names drawn from the globally
	    seeded ``random`` module were predictable and could collide, letting one
	    request overwrite another request's image.

	    Args:
	        visual_img: Object with a ``save(path)`` method (a PIL image in this
	            file's callers).

	    Returns:
	        The path of the written file.
	    """
	    import tempfile  # local import: only this function needs it

	    fd, file_path = tempfile.mkstemp(prefix="gradio", suffix=".jpg", dir="/tmp")
	    # mkstemp returns an open descriptor; close it and let save() reopen the path.
	    os.close(fd)
	    visual_img.save(file_path)
	    return file_path


	def mask2bbox(mask):
	    """Convert a drawn mask into a ``{<x0><y0><x1><y1>}`` box string.

	    The mask is resized to 100x100 with nearest-neighbour resampling and
	    only its first channel is inspected; the box is the tightest rectangle
	    around the non-zero cells, expressed in that 100x100 grid.

	    Args:
	        mask: Image-like object with ``resize`` (a PIL image), or ``None``.

	    Returns:
	        The box string, or ``''`` when ``mask`` is ``None`` or has no
	        non-zero cell.
	    """
	    if mask is None:
	        return ''

	    grid = np.array(mask.resize([100, 100], resample=Image.NEAREST))[:, :, 0]
	    row_hits = np.any(grid, axis=1)
	    col_hits = np.any(grid, axis=0)

	    if not row_hits.sum():
	        return ''

	    # Get the top, bottom, left, and right boundaries
	    rmin, rmax = np.where(row_hits)[0][[0, -1]]
	    cmin, cmax = np.where(col_hits)[0][[0, -1]]
	    return '{{<{}><{}><{}><{}>}}'.format(cmin, rmin, cmax, rmax)


	def escape_markdown(text):
	    """Backslash-escape every ``<`` and ``>`` in ``text``.

	    Args:
	        text: String to escape.

	    Returns:
	        ``text`` with ``<`` replaced by ``\\<`` and ``>`` by ``\\>``.
	    """
	    # Special characters that need to be escaped, mapped to their escaped form
	    escapes = str.maketrans({'<': '\\<', '>': '\\>'})
	    return text.translate(escapes)


	def reverse_escape(text):
	    """Undo ``escape_markdown``: turn ``\\<`` / ``\\>`` back into ``<`` / ``>``.

	    Args:
	        text: Escaped string, or ``None``.

	    Returns:
	        The unescaped string; ``""`` when ``text`` is ``None``.
	    """
	    # Add safety check for None values
	    if text is None:
	        return ""

	    return text.replace('\\<', '<').replace('\\>', '>')


	# Palette used for drawing boxes and colouring phrases; entries repeat in places.
	colors = [
	    (255, 0, 0), (0, 255, 0), (0, 0, 255), (210, 210, 0),
	    (255, 0, 255), (0, 255, 255), (114, 128, 250), (0, 165, 255),
	    (0, 128, 0), (144, 238, 144), (238, 238, 175), (255, 191, 0),
	    (0, 128, 0), (226, 43, 138), (255, 0, 255), (0, 215, 255),
	]

	# "#rrggbb"-style string per palette entry, built from the channels in
	# reverse order and keyed by the entry's index as a string.
	color_map = {
	    str(idx): "#{:02x}{:02x}{:02x}".format(channels[2], channels[1], channels[0])
	    for idx, channels in enumerate(colors)
	}

	# Alias of the same list object, not a copy.
	used_colors = colors

	def get_first_frame(video_path):
	    """Read the first frame of a video with OpenCV.

	    Args:
	        video_path: Path handed to ``cv2.VideoCapture``.

	    Returns:
	        The frame returned by ``VideoCapture.read``, or ``None`` (after
	        printing an error) when the video cannot be opened or read.
	    """
	    capture = cv2.VideoCapture(video_path)
	    if not capture.isOpened():
	        print("Error: Cannot open video.")
	        return None

	    ok, first_frame = capture.read()
	    capture.release()

	    if not ok:
	        print("Error: Cannot read frame from video.")
	        return None
	    return first_frame

	def _bbox_to_pixels(coords, image_width, image_height):
	    """Scale four ``bounding_box_size``-grid coordinates to pixel units."""
	    x0, y0, x1, y1 = (int(value) for value in coords)
	    return [
	        x0 / bounding_box_size * image_width,
	        y0 / bounding_box_size * image_height,
	        x1 / bounding_box_size * image_width,
	        y1 / bounding_box_size * image_height,
	    ]


	def visualize_all_bbox_together(image, generation):
	    """Draw the bounding boxes mentioned in ``generation`` onto ``image``.

	    Two text formats are recognised:

	    * ``<p>phrase</p>{<x0><y0><x1><y1>}`` groups, optionally several boxes
	      joined by ``<delim>`` (mode ``'all'``): every box is drawn and
	      labelled with its phrase.
	    * Text containing exactly four integers (mode ``'single'``): one
	      unlabelled box is drawn.

	    Coordinates are on a ``bounding_box_size`` grid and are scaled to the
	    image after it has been resized to a width of 500 pixels.

	    Args:
	        image: A PIL image, a video path (its first frame is used), or
	            ``None``.
	        generation: Text that may contain box coordinates; HTML entities are
	            unescaped before parsing.

	    Returns:
	        ``(pil_image, generation_colored)``. ``generation_colored`` is the
	        text with box coordinates and ``<delim>`` removed and each phrase
	        wrapped in a coloured ``<span>`` (mode ``'all'``), otherwise ``''``.
	        Returns ``(None, '')`` when there is no image, the first frame
	        cannot be read, or no valid box is found.

	    Raises:
	        ValueError: If the image is of an unsupported type.
	    """
	    if image is None:
	        return None, ''

	    if isinstance(image, str):  # is a image path
	        raw_image = get_first_frame(image)
	        if raw_image is None:
	            return None, ''
	        frame_rgb = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB)
	        image = Image.fromarray(frame_rgb)

	    generation = html.unescape(generation)

	    image_width, image_height = image.size
	    image = image.resize([500, int(500 / image_width * image_height)])
	    image_width, image_height = image.size

	    string_list = extract_substrings(generation)
	    if string_list:  # it is grounding or detection
	        mode = 'all'
	        entities = defaultdict(list)
	        for string in string_list:
	            try:
	                obj, string = string.split('</p>')
	            except ValueError:
	                print('wrong string: ', string)
	                continue
	            for bbox_string in string.split('<delim>'):
	                integers = re.findall(r'-?\d+', bbox_string)
	                if len(integers) == 4:
	                    entities[obj].append(_bbox_to_pixels(integers, image_width, image_height))
	    else:
	        integers = re.findall(r'-?\d+', generation)
	        if len(integers) != 4:
	            # don't detect any valid bbox to visualize
	            return None, ''
	        mode = 'single'  # it is refer
	        entities = [_bbox_to_pixels(integers, image_width, image_height)]

	    if len(entities) == 0:
	        return None, ''

	    # NOTE: after the resize above `image` is expected to be a PIL image; the
	    # str / Tensor branches are kept from the original code for safety.
	    if isinstance(image, Image.Image):
	        image_h = image.height
	        image_w = image.width
	        image = np.array(image)
	    elif isinstance(image, str):
	        if os.path.exists(image):
	            pil_img = Image.open(image).convert("RGB")
	            image = np.array(pil_img)[:, :, [2, 1, 0]]
	            image_h = pil_img.height
	            image_w = pil_img.width
	        else:
	            raise ValueError(f"invaild image path, {image}")
	    elif isinstance(image, torch.Tensor):
	        image_tensor = image.cpu()
	        reverse_norm_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:, None, None]
	        reverse_norm_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:, None, None]
	        image_tensor = image_tensor * reverse_norm_std + reverse_norm_mean
	        pil_img = T.ToPILImage()(image_tensor)
	        image_h = pil_img.height
	        image_w = pil_img.width
	        image = np.array(pil_img)[:, :, [2, 1, 0]]
	    else:
	        raise ValueError(f"invaild image format, {type(image)} for {image}")

	    new_image = image.copy()

	    previous_bboxes = []
	    # size of text
	    text_size = 0.5
	    # thickness of text
	    text_line = 1
	    box_line = 2
	    (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
	    base_height = int(text_height * 0.675)
	    text_offset_original = text_height - base_height
	    text_spaces = 2

	    used_colors = colors

	    for entity_idx, entity_name in enumerate(entities):
	        if mode == 'single' or mode == 'identify':
	            # here each "entity" is itself one [left, bottom, right, top] box
	            bboxes = [entity_name]
	        else:
	            bboxes = entities[entity_name]

	        for x1_norm, y1_norm, x2_norm, y2_norm in bboxes:
	            skip_flag = False
	            orig_x1, orig_y1, orig_x2, orig_y2 = int(x1_norm), int(y1_norm), int(x2_norm), int(y2_norm)

	            color = used_colors[entity_idx % len(used_colors)]
	            new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line)

	            if mode != 'all':
	                continue

	            l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1

	            x1 = orig_x1 - l_o
	            y1 = orig_y1 - l_o

	            # Not enough room above the box: put the label just inside its top edge.
	            if y1 < text_height + text_offset_original + 2 * text_spaces:
	                y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
	                x1 = orig_x1 + r_o

	            # add text background
	            (text_width, text_height), _ = cv2.getTextSize(
	                f" {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
	            label_height = text_height + text_offset_original + 2 * text_spaces
	            text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - label_height, x1 + text_width, y1

	            for prev_bbox in previous_bboxes:
	                # Same phrase drawn at (almost) the same place already: skip this label.
	                if computeIoU((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox['bbox']) > 0.95 and \
	                        prev_bbox['phrase'] == entity_name:
	                    skip_flag = True
	                    break
	                # Push the label down until it no longer covers the earlier one.
	                while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox['bbox']):
	                    text_bg_y1 += label_height
	                    text_bg_y2 += label_height
	                    y1 += label_height

	                    if text_bg_y2 >= image_h:
	                        text_bg_y1 = max(0, image_h - label_height)
	                        text_bg_y2 = image_h
	                        y1 = image_h
	                        break

	            if skip_flag:
	                continue

	            alpha = 0.5
	            for row in range(text_bg_y1, text_bg_y2):
	                for col in range(text_bg_x1, text_bg_x2):
	                    # Stay inside the image: a negative index would wrap
	                    # around and paint the opposite edge.
	                    if 0 <= row < image_h and 0 <= col < image_w:
	                        if col < text_bg_x1 + 1.35 * c_width:
	                            # original color
	                            bg_color = color
	                        else:
	                            # white
	                            bg_color = [255, 255, 255]
	                        new_image[row, col] = (
	                            alpha * new_image[row, col] + (1 - alpha) * np.array(bg_color)
	                        ).astype(np.uint8)

	            cv2.putText(
	                new_image, f" {entity_name}", (x1, y1 - text_offset_original - 1 * text_spaces),
	                cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
	            )

	            previous_bboxes.append(
	                {'bbox': (text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), 'phrase': entity_name})

	    if mode == 'all':
	        def color_iterator(colors):
	            while True:
	                for color in colors:
	                    yield color

	        color_gen = color_iterator(colors)

	        # Add colors to phrases and remove <p></p>
	        def colored_phrases(match):
	            phrase = match.group(1)
	            color = next(color_gen)
	            return f'<span style="color:rgb{color}">{phrase}</span>'

	        # Strip every box group and every <delim>. The alternation must be an
	        # unescaped "|": with "\|" the pattern only matched a literal "}|<delim>"
	        # and the coordinates stayed in the displayed text.
	        generation = re.sub(r'{<\d+><\d+><\d+><\d+>}|<delim>', '', generation)
	        generation_colored = re.sub(r'<p>(.*?)</p>', colored_phrases, generation)
	    else:
	        generation_colored = ''

	    pil_image = Image.fromarray(new_image)
	    return pil_image, generation_colored


	def gradio_reset(chat_state, img_list):
	    """Clear the conversation and re-enable the inputs ("Restart" button).

	    Args:
	        chat_state: Conversation object or ``None``; its ``messages`` are
	            emptied in place when present.
	        img_list: Current media list or ``None``.

	    Returns:
	        A 5-tuple for ``[chatbot, image, text_input, chat_state, img_list]``:
	        ``None`` for the chat history, an update clearing the media input,
	        an update resetting the textbox placeholder, the (emptied)
	        ``chat_state``, and a new empty list (or ``None`` if ``img_list``
	        was ``None``).
	    """
	    if chat_state is not None:
	        chat_state.messages = []

	    cleared_imgs = img_list if img_list is None else []
	    media_update = gr.update(value=None, interactive=True)
	    textbox_update = gr.update(placeholder='Upload your image and chat', interactive=True)
	    return None, media_update, textbox_update, chat_state, cleared_imgs


	def image_upload_trigger(gr_img, upload_flag, replace_flag, img_list):
	    """Record that new media was uploaded.

	    Args:
	        gr_img: The uploaded media value (only logged).
	        upload_flag: Previous upload flag (ignored; always set to 1).
	        replace_flag: Previous replace flag.
	        img_list: Media already attached to the conversation.

	    Returns:
	        ``(upload_flag, replace_flag)``: the upload flag is always 1; the
	        replace flag becomes 1 when ``img_list`` is non-empty (old media and
	        conversation exist and must be reset later), otherwise it is
	        returned unchanged.
	    """
	    print(f"Image upload triggered: {gr_img}")
	    return 1, (1 if img_list else replace_flag)


	def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag, replace_flag):
	    """Register the user's message (and any new media) with the chat.

	    Args:
	        user_message: Text typed by the user.
	        chatbot: Chat history as a list of ``[user, bot]`` pairs.
	        chat_state: Conversation object, or ``None`` before the first turn.
	        gr_img: Media value from the UI; a dict with ``image``/``mask`` keys
	            is unpacked.
	        img_list: Media already attached to the conversation.
	        upload_flag: Truthy when new media was uploaded since the last turn.
	        replace_flag: Truthy when that upload replaces earlier media.

	    Returns:
	        ``(textbox_text, chatbot, chat_state, img_list, upload_flag,
	        replace_flag)``. ``textbox_text`` is an error/hint message or ``''``.
	    """
	    print("+++gradio_ask+++")
	    print(f"gr_img: {gr_img}, type: {type(gr_img)}")
	    print(f"upload_flag: {upload_flag}, replace_flag: {replace_flag}")

	    text_box_show = 'Input should not be empty!' if len(user_message) == 0 else ''

	    print('user_message:', user_message)
	    print('chatbot:', chatbot)
	    print('chat_state:', chat_state)

	    mask = None
	    if isinstance(gr_img, dict):
	        gr_img, mask = gr_img['image'], gr_img['mask']

	    wants_identify = '[identify]' in user_message
	    # check if user provide bbox in the text input; if not, derive it from the mask
	    if wants_identify and len(re.findall(r'-?\d+', user_message)) != 4:
	        user_message = user_message + mask2bbox(mask)

	    if chat_state is None:
	        chat_state = CONV_VISION.copy()

	    if gr_img is None:
	        return "Please upload a video first.", chatbot, chat_state, img_list, upload_flag, replace_flag

	    # Always process the image if it exists and upload_flag is set or img_list is empty
	    if upload_flag or len(img_list) == 0:
	        if replace_flag:
	            chat_state = CONV_VISION.copy()  # new image, reset everything
	            replace_flag = 0
	            chatbot = []
	        img_list = []
	        try:
	            chat.upload_img(gr_img, chat_state, img_list)
	            print(f"Image uploaded successfully. img_list length: {len(img_list)}")
	        except Exception as e:
	            print(f"Error uploading image: {e}")
	            return "Error uploading image. Please try again.", chatbot, chat_state, img_list, 0, replace_flag
	        upload_flag = 0

	    chat.ask(user_message, chat_state)
	    print('user_message: ', user_message)
	    print('chat_state: ', chat_state)

	    chatbot = chatbot + [[user_message, None]]

	    if wants_identify:
	        visual_img, _ = visualize_all_bbox_together(gr_img, user_message)
	        if visual_img is not None:
	            chatbot = chatbot + [[(save_tmp_img(visual_img),), None]]

	    return text_box_show, chatbot, chat_state, img_list, upload_flag, replace_flag


	def gradio_answer(chatbot, chat_state, img_list, temperature):
	    """Generate the full (non-streaming) answer for the latest user turn.

	    Args:
	        chatbot: Chat history; the last pair's bot slot is filled in place.
	        chat_state: Conversation object passed to ``chat.answer``.
	        img_list: Media attached to the conversation.
	        temperature: Sampling temperature.

	    Returns:
	        ``(chatbot, chat_state)``.
	    """
	    print("--gradio_answer--")
	    # print('img_list: ', img_list)
	    generated = chat.answer(
	        conv=chat_state,
	        img_list=img_list,
	        temperature=temperature,
	        max_new_tokens=500,
	        max_length=2000,
	    )
	    llm_message = generated[0]
	    chatbot[-1][1] = llm_message
	    print('gradio_answer: ', llm_message)

	    return chatbot, chat_state

	def process_english_text(text):
	    """Tidy capitalisation and final punctuation of generated English text.

	    The first character of every ``'. '``-separated sentence is upper-cased
	    and the rest of the sentence is left untouched. (``str.capitalize`` was
	    used before, which also lower-cased everything else, turning "I" into
	    "i" and mangling names and acronyms.) A trailing comma is dropped and a
	    final period is appended when missing.

	    Args:
	        text: Text to clean up.

	    Returns:
	        The cleaned text; strings shorter than two characters are returned
	        unchanged.
	    """
	    if len(text) < 2:
	        return text

	    # s[:1] keeps empty fragments (e.g. from ". . ") from raising IndexError.
	    sentences = [s[:1].upper() + s[1:] for s in text.split('. ')]
	    text = '. '.join(sentences)

	    if text.endswith(','):
	        text = text[:-1]
	    if not text.endswith('.'):
	        text += '.'

	    return text

	# Generator that streams the model's answer into the last chatbot entry.
	# Takes (chatbot, chat_state, img_list, temperature) and yields
	# (chatbot, chat_state) pairs as the displayed text grows.
	# Decorated with spaces.GPU (from the `spaces` package imported at the top).
	# NOTE(review): block nesting is not visible in this copy of the file; the
	# comments below describe the statements in order and hedge where the
	# nesting matters.
	@spaces.GPU
	def gradio_stream_answer(chatbot, chat_state, img_list, temperature):
	print('---gradio_stream_answer---')
	print(f"img_list length: {len(img_list)}")

	# Check if img_list is empty
	# Without media: write an error into the last chat entry (if any), yield once, stop.
	if len(img_list) == 0:
	error_msg = "No image/video uploaded. Please upload a video first."
	print(error_msg)
	if len(chatbot) > 0:
	chatbot[-1][1] = error_msg
	yield chatbot, chat_state
	return

	# Encode the media unless the first entry is already a torch.Tensor.
	# NOTE(review): this length check repeats the guard above, which has already returned for an empty list.
	if len(img_list) > 0:
	if not isinstance(img_list[0], torch.Tensor):
	chat.encode_img(img_list)
	print(chat)

	try:
	streamer = chat.stream_answer(conv=chat_state,
	img_list=img_list,
	temperature=temperature,
	max_new_tokens=500,
	max_length=2000)
	# `output` accumulates the escaped raw chunks; the chatbot entry shows the
	# accumulated text after process_english_text has been applied to it.
	output = ''
	print('streamer:', streamer)
	for new_output in streamer:
	escapped = escape_markdown(new_output)
	output += escapped
	chatbot[-1][1] = output
	chatbot[-1][1] = process_english_text(chatbot[-1][1])
	yield chatbot, chat_state
	# Overwrite the content slot of the conversation's last message with the '</s>' marker.
	# NOTE(review): presumably this runs once after the streaming loop — confirm against the original indentation.
	chat_state.messages[-1][1] = '</s>'
	print('output:', output)
	# Any failure above is reported in the chat instead of propagating.
	except Exception as e:
	error_msg = f"Error generating response: {str(e)}"
	print(error_msg)
	if len(chatbot) > 0:
	chatbot[-1][1] = error_msg
	yield chatbot, chat_state

	# A generator's return value is not one of its yielded items; consumers
	# that only iterate never see this tuple.
	return chatbot, chat_state


	def gradio_visualize(chatbot, gr_img):
	    """Append a box visualisation for the latest answer, when it has boxes.

	    Args:
	        chatbot: Chat history as a list of ``[user, bot]`` pairs.
	        gr_img: Media value from the UI; a dict with ``image``/``mask`` keys
	            is unpacked.

	    Returns:
	        ``chatbot`` unchanged when it is empty, the last answer is ``None``,
	        or nothing could be drawn. Otherwise a new list with an extra
	        ``[None, (file_path,)]`` entry; the last answer is replaced by its
	        coloured version when one was produced.
	    """
	    # Safety check for empty chatbot or None response
	    if len(chatbot) == 0:
	        return chatbot
	    latest_answer = chatbot[-1][1]
	    if latest_answer is None:
	        return chatbot

	    if isinstance(gr_img, dict):
	        gr_img, _mask = gr_img['image'], gr_img['mask']

	    visual_img, generation_color = visualize_all_bbox_together(gr_img, reverse_escape(latest_answer))
	    if visual_img is None:
	        return chatbot

	    if len(generation_color):
	        chatbot[-1][1] = generation_color
	    return chatbot + [[None, (save_tmp_img(visual_img),)]]


	def gradio_taskselect(idx):
	    """Map a task-shortcut index to its prompt prefix and hint text.

	    Args:
	        idx: Index of the selected shortcut (0 = no tag, then reason,
	            emotion, visual, audio).

	    Returns:
	        ``(prompt_prefix, hint)`` for that shortcut.

	    Raises:
	        IndexError: If ``idx`` is outside the five shortcuts.
	    """
	    shortcuts = [
	        ('', 'Hint: Type in whatever you want'),
	        ('[reason] ', 'Hint: Send the command to multimodal emotion reasoning'),
	        ('[emotion] ', 'Hint: Send the command to multimodal emotion recognition'),
	        ('[visual] ', 'Hint: Send the command to generate visual description'),
	        ('[audio] ', 'Hint: Send the command to generate audio description'),
	    ]
	    prompt_prefix, hint = shortcuts[idx]
	    return prompt_prefix, hint


	# Single module-level Chat wrapper around the loaded model and visual processor;
	# the gradio_* callbacks use it for upload_img / ask / answer / stream_answer / encode_img.
	chat = Chat(model, vis_processor, device=device)

	# Static text shown in the UI.
	title = """<h1 align="center">Emotion-LLaMA Demo</h1>"""
	# Only referenced from a commented-out gr.Markdown call in the layout.
	description = 'Welcome to Our Emotion-LLaMA Chatbot Demo!'
	article = """<p><a href='https://anonymous.4open.science/r/Emotion-LLaMA'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p>"""

	# Usage notes rendered as Markdown (typo "Involging" corrected to "Involving").
	introduction = '''
	For Abilities Involving Multimodal Emotion Understanding:
	1. Reason: Click Send to generate a multimodal emotion description.
	2. Emotion: Click Send to generate an emotion label.
	3. Visual: Click Send to generate a visual description.
	4. Audio: Click Send to generate an audio description.
	5. No Tag: Input whatever you want and click Send without any tagging.
	You can also simply chat in free form!
	'''

	# Gradio UI definition and launch.
	# NOTE(review): container nesting (which widgets sit in which Row/Column) is
	# not visible in this copy of the file; comments describe the statements in order.

	# The textbox is created before the Blocks context and placed later with text_input.render().
	# NOTE(review): the placeholder mentions an image although the media input below is a gr.Video.
	text_input = gr.Textbox(placeholder='Upload your image and chat', interactive=True, show_label=False, container=False, scale=8)

	with gr.Blocks() as demo:
	gr.Markdown(title)
	# gr.Markdown(description)
	gr.Markdown(article)

	with gr.Row():
	# NOTE(review): Gradio documents `scale` as an integer; 0.5 may be rejected
	# or warned about depending on the installed version — confirm.
	with gr.Column(scale=0.5):
	# image = gr.Image(type="pil", tool='sketch', brush_radius=20)
	# The variable is still called `image`, but the component is a video input.
	image = gr.Video(sources=["upload", "webcam"])

	temperature = gr.Slider(
	minimum=0.1,
	maximum=1.5,
	value=0.2,
	step=0.1,
	interactive=True,
	label="Temperature",
	)

	clear = gr.Button("Restart")

	gr.Markdown(introduction)

	with gr.Column():
	# Per-session state: the conversation object and the attached media list.
	chat_state = gr.State(value=None)
	img_list = gr.State(value=[])
	chatbot = gr.Chatbot(label='Emotion-LLaMA')

	# Updated Dataset component for Gradio 5
	# The five samples line up with the five entries handled by gradio_taskselect,
	# which indexes its lists with the value it receives (type="index").
	dataset = gr.Dataset(
	components=[gr.Textbox(visible=False)],
	samples=[['No Tag'], ['reason'], ['emotion'], ['visual'], ['audio']],
	type="index",
	label='Task Shortcuts',
	)
	task_inst = gr.Markdown('Hint: Upload your video and chat')
	with gr.Row():
	text_input.render()
	send = gr.Button("Send", variant='primary', size='sm', scale=1)

	# Flags consumed by gradio_ask: "new media uploaded" and "it replaces older media".
	upload_flag = gr.State(value=0)
	replace_flag = gr.State(value=0)

	# Updated upload trigger for Gradio 5 - fixed parameter order
	# NOTE(review): the flags are only set by the `upload` event; whether webcam
	# recordings or example clicks also fire it is not visible here.
	image.upload(image_upload_trigger, [image, upload_flag, replace_flag, img_list], [upload_flag, replace_flag])

	# Updated Examples component for Gradio 5 - this is the key fix!
	# Each example is a (video path, prompt) pair that fills `image` and `text_input`.
	with gr.Row():
	with gr.Column():
	examples1 = gr.Examples(
	examples=[
	["examples/samplenew_00004251.mp4", "[detection] face"],
	["examples/sample_00000338.mp4", "The person in video says: Oh no, my phone and wallet are all in my bag. [emotion] Please determine which emotion label in the video represents: happy, sad, neutral, angry, worried, surprise."],
	["examples/sample_00000669.mp4", "The person in video says: Why are you looking at me like this? It's just a woman, so you have to have something to do with me. [emotion] Determine the emotional state shown in the video, choosing from happy, sad, neutral, angry, worried, or surprise."],
	["examples/sample_00003462.mp4", "The person in video says: Do you believe that you push me around? [emotion] Assess and label the emotion evident in the video: could it be happy, sad, neutral, angry, worried, surprise?"],
	["examples/sample_00000727.mp4", "The person in video says: No, this, I have to get up! You, I'm sorry, everyone. I'm sorry, it's from the German side. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, worried, or surprise?"],
	["examples/samplenew_00061200.mp4", "The person in video says: Me: I'm not going in anymore, scared. [emotion] Identify the displayed emotion in the video: is it happy, sad, neutral, angry, fear, contempt, doubt, worried, or surprise?"],
	],
	inputs=[image, text_input],
	# Remove fn and outputs - let Examples handle this automatically in Gradio 5
	)
	with gr.Column():
	examples2 = gr.Examples(
	examples=[
	["examples/samplenew_00051251.mp4", "In what state is the person in the video, say the following: \"Do you really think so?\""],
	["examples/sample_00004735.mp4", "[visual] What are the emotions of the woman in the video?"],
	["examples/sample_00002422.mp4", "[audio] Analyze the speaker's voice in the video."],
	["examples/sample_00001073.mp4", "The person in video says: Make him different from before. I like the way you are now. [reason] Please analyze all the clues in the video and reason out the emotional label of the person in the video."],
	["examples/sample_00004671.mp4", "The person in video says: Won't you? Impossible! Fan Xiaomei is not such a person. [reason] What are the facial expressions and vocal tone used in the video? What is the intended meaning behind his words? Which emotion does this reflect?"],
	["examples/sample_00005854.mp4", "The person in video says: Bastard! Boss, you don't choose, you prefer. [reason] Please integrate information from various modalities to infer the emotional category of the person in the video."],
	],
	inputs=[image, text_input],
	# Remove fn and outputs - let Examples handle this automatically in Gradio 5
	)

	# Clicking a task shortcut writes its prompt prefix into the textbox and its hint into task_inst.
	dataset.click(
	gradio_taskselect,
	inputs=[dataset],
	outputs=[text_input, task_inst],
	show_progress="hidden",
	queue=False,
	)

	# Pressing Enter in the textbox runs three steps in sequence:
	# gradio_ask -> gradio_stream_answer -> gradio_visualize.
	text_input.submit(
	gradio_ask,
	[text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag],
	[text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False
	).then(
	gradio_stream_answer,
	[chatbot, chat_state, img_list, temperature],
	[chatbot, chat_state]
	).then(
	gradio_visualize,
	[chatbot, image],
	[chatbot],
	queue=False,
	)

	# The Send button runs the same three-step chain as text_input.submit above.
	# NOTE(review): the two chains are duplicated verbatim and must be kept in sync by hand.
	send.click(
	gradio_ask,
	[text_input, chatbot, chat_state, image, img_list, upload_flag, replace_flag],
	[text_input, chatbot, chat_state, img_list, upload_flag, replace_flag], queue=False
	).then(
	gradio_stream_answer,
	[chatbot, chat_state, img_list, temperature],
	[chatbot, chat_state]
	).then(
	gradio_visualize,
	[chatbot, image],
	[chatbot],
	queue=False,
	)

	# "Restart" clears the chat history, media input, textbox and both pieces of session state.
	clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, chat_state, img_list], queue=False)

	# Enable the request queue, then start the server with default settings.
	demo.queue()
	demo.launch()