Spaces:

junyangwang0410
/

Mobile-Agent

Sleeping

Mobile-Agent / app.py

阳渠

Add application file

0f17fe9 4 months ago

7.22 kB

	import gradio as gr
	import base64
	import io
	import requests
	import json
	from PIL import ImageDraw
	from io import BytesIO

	# Mutable module-level conversation state, read and written by chatbot() and cleared by reset_demo().
	chat_log = []  # HTML fragments (user and bot messages) in display order
	request_count = 0  # request counter; chatbot() uses it to tell the first request from follow-ups
	now_session_id = ""  # session id taken from the service's first reply; "" until then

	# Inline <style> block that chatbot() prepends to the chat HTML it returns.
	# It styles the chat container plus the user/bot message bubbles and image thumbnails.
	chatbot_css = """
	<style>
	.chat-container {
	display: flex;
	flex-direction: column;
	overflow-y: auto;
	max-height: 630px;
	margin: 10px;
	}
	.user-message, .bot-message {
	margin: 5px;
	padding: 10px;
	border-radius: 10px;
	}
	.user-message {
	text-align: right;
	background-color: #7B68EE;
	color: white;
	align-self: flex-end;
	}
	.bot-message {
	text-align: left;
	background-color: #ADD8E6;
	color: black;
	align-self: flex-start;
	}
	.user-image {
	text-align: right;
	align-self: flex-end;
	max-width: 150px;
	max-height: 300px;
	}
	.bot-image {
	text-align: left;
	align-self: flex-start;
	max-width: 200px;
	max-height: 400px;
	}
	</style>
	"""

	def encode_image(image):
	    """Return *image* serialized as JPEG and base64-encoded into a text string.

	    Args:
	        image: an object with a PIL-style ``save(fp, format=...)`` method. If it
	            exposes a ``mode`` that JPEG cannot store (for example ``RGBA`` or
	            ``P``), it is converted to ``RGB`` first via ``image.convert``.

	    Returns:
	        The base64 text of the JPEG bytes.
	    """
	    # Modes Pillow's JPEG writer accepts directly; any other mode makes
	    # save(..., format="JPEG") raise OSError (e.g. PNG screenshots with alpha).
	    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
	    if getattr(image, "mode", "RGB") not in jpeg_modes:
	        # Rebinds the local name only; the caller keeps its own reference.
	        image = image.convert("RGB")

	    buffer = BytesIO()
	    image.save(buffer, format="JPEG")
	    return base64.b64encode(buffer.getvalue()).decode('utf-8')

	def get_action(image, query, session_id):
	    """Send one screenshot to the Mobile-Agent service and return the raw HTTP response.

	    Args:
	        image: screenshot passed to ``encode_image``; its base64 JPEG is sent
	            as the ``screenshot`` field.
	        query: instruction text sent as the ``query`` field.
	        session_id: value sent as the ``session_id`` field.

	    Returns:
	        The ``requests.Response`` from the POST. The status code is not
	        checked and the body is not parsed here.
	    """
	    import os  # local import so this block needs no new file-level import

	    # NOTE(review): the fallback token below is committed in source and must be
	    # treated as leaked. Rotate it, set DASHSCOPE_API_KEY in the deployment's
	    # secrets, and then delete the literal.
	    api_key = os.environ.get("DASHSCOPE_API_KEY") or "sk-6bddfc116de744c3aa1d66893cc87b20"

	    headers = {
	        'Authorization': "Bearer " + api_key,
	        'Content-Type': 'application/json'
	    }

	    data = {
	        "model": "pre-Mobile_Agent_Server-1664",
	        "input": {
	            "screenshot": encode_image(image),
	            "query": query,
	            "session_id": session_id
	        }
	    }

	    # The timeout is in seconds (25 minutes); kept as in the original.
	    response = requests.post(
	        "https://poc-dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation",
	        headers=headers,
	        data=json.dumps(data),
	        timeout=1500,
	    )
	    return response

	def image_to_base64(image):
	    """Return an HTML ``<img>`` tag embedding *image* as a base64 PNG data URI.

	    *image* must provide a PIL-style ``save(fp, format=...)`` method.
	    """
	    png_buffer = io.BytesIO()
	    image.save(png_buffer, format="PNG")
	    payload = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
	    return '<img src="data:image/png;base64,' + payload + '" />'

	def _describe_action(image, action, parameter):
	    """Return the HTML-safe instruction text for one agent action.

	    For 'tap' and 'slide' the target circles are drawn onto *image* in place.
	    Raises ValueError/TypeError/IndexError when *parameter* does not have the
	    shape the action needs.
	    """
	    import html  # local import so this block needs no new file-level import

	    radius = 50

	    if action == 'end':
	        if parameter == '':
	            return "The instructions have been completed. Please click \"Clear\"."
	        # Service-provided text is escaped before it is embedded in the chat HTML.
	        return html.escape(str(parameter))

	    if action == 'tap':
	        # The original always JSON-decoded the parameter; an already-decoded
	        # list is accepted too.
	        coords = json.loads(parameter) if isinstance(parameter, str) else parameter
	        x, y = coords[0], coords[1]
	        draw = ImageDraw.Draw(image)
	        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=4)
	        return "Please click on the red circle and upload the current screenshot again."

	    if action == 'slide':
	        coords = json.loads(parameter) if isinstance(parameter, str) else parameter
	        x1, y1, x2, y2 = coords[0][0], coords[0][1], coords[1][0], coords[1][1]
	        draw = ImageDraw.Draw(image)
	        # NOTE(review): the message says "from blue circle to red circle", yet the
	        # first point is drawn red and the second blue. If coords[0] is the start
	        # of the swipe the colours are inverted — confirm against the service.
	        draw.ellipse([x1 - radius, y1 - radius, x1 + radius, y1 + radius], outline='red', width=5)
	        draw.ellipse([x2 - radius, y2 - radius, x2 + radius, y2 + radius], outline='blue', width=5)
	        return "Please slide from blue circle to red circle and upload the current screenshot again."

	    if action == 'type':
	        typed_text = html.escape(str(parameter))
	        return f"Please type the \"{typed_text}\" and upload the current screenshot again."

	    if action == 'back':
	        return "Please back to previous page and upload the current screenshot again."

	    if action == 'exit':
	        return "Please back to home page and upload the current screenshot again."

	    # Previously an unknown action left the reply text unbound and crashed.
	    return "Unsupported action: {}".format(html.escape(str(action)))


	def chatbot(image, text):
	    """Handle one submit: ask the agent for the next action and re-render the chat.

	    Args:
	        image: the uploaded screenshot (a PIL image), or None if nothing was
	            uploaded. For 'tap'/'slide' actions it is annotated in place.
	        text: the instruction; sent to the service only on the first request
	            of a conversation.

	    Returns:
	        ``chatbot_css`` followed by the HTML of the whole conversation so far.

	    Side effects:
	        Appends to ``chat_log``. After a successful service reply it increments
	        ``request_count``, and on the first request stores the reply's session
	        id in ``now_session_id``.

	    NOTE(review): this state is module-global, so concurrent users of one
	    process would share a single conversation — confirm whether per-session
	    state is wanted.
	    """
	    import html  # local import so this block needs no new file-level import

	    global chat_log, request_count, now_session_id

	    # Escape the user's text: it is embedded verbatim in the returned HTML.
	    user_msg = "<div class='user-message'>{}</div>".format(html.escape(str(text)))
	    if image is not None:
	        # Encoded before any circles are drawn, so the user bubble shows the
	        # unannotated upload.
	        user_msg += "<div class='user-image'>{}</div>".format(image_to_base64(image))

	    if image is None:
	        # Nothing to send; previously this crashed inside the request path.
	        bot_response = "Please upload a screenshot before clicking \"Submit\"."
	    else:
	        # Counted only after a successful reply, so a failed first request is
	        # retried as a first request (with the instruction) on the next submit.
	        is_first_request = request_count == 0
	        response = None
	        try:
	            if is_first_request:
	                response = get_action(image, text, "")
	            else:
	                response = get_action(image, "", now_session_id)
	            output = response.json()['output']  # parse the body once
	            action = output['action']
	            parameter = output['parameter']
	            session_id = output['session_id'] if is_first_request else now_session_id
	        except (requests.RequestException, ValueError, KeyError, TypeError, OSError) as err:
	            print(response, err)
	            bot_response = "The request failed. Please upload the current screenshot and try again."
	        else:
	            now_session_id = session_id
	            request_count += 1
	            try:
	                bot_response = _describe_action(image, action, parameter)
	            except (ValueError, TypeError, IndexError, KeyError) as err:
	                print(response, err)
	                bot_response = "The service returned an operation that could not be understood. Please try again."

	    bot_msg = "<div class='bot-message'>{}</div>".format(bot_response)
	    if image is not None:
	        bot_msg += "<div class='bot-image'>{}</div>".format(image_to_base64(image))

	    chat_log.append(user_msg)
	    chat_log.append(bot_msg)

	    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
	    return chatbot_css + chat_html

	def lock_input(image, instruction):
	    """Build the component updates applied after a submit.

	    Args:
	        image: accepted for signature parity with the submit inputs; unused.
	        instruction: text to keep in the instruction box.

	    Returns:
	        A 2-tuple of ``gr.update`` objects: the first sets the value to
	        *instruction* and makes the component non-interactive, the second
	        sets the value to None.
	    """
	    frozen_instruction = gr.update(value=instruction, interactive=False)
	    cleared_value = gr.update(value=None)
	    return frozen_instruction, cleared_value

	def reset_demo():
	    """Clear the conversation state and return the values for a fresh start.

	    Rebinds the module-level ``chat_log``, ``request_count`` and
	    ``now_session_id`` to their initial values.

	    Returns:
	        A 2-tuple: an empty string, and a ``gr.update`` that empties the value
	        and makes the component interactive again.
	    """
	    global chat_log, request_count, now_session_id
	    chat_log, request_count, now_session_id = [], 0, ""

	    unlocked_instruction = gr.update(value="", interactive=True)
	    return "", unlocked_instruction

	# Usage instructions, written as Markdown and rendered in the UI with gr.Markdown.
	tos_markdown = ("""### Terms of use
	1. In the Instruction field, enter the instruction you want to execute.
	2. In the Screenshot field, upload a screenshot of your current mobile device.
	3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the current screenshot again.
	4. Once the Instruction is entered, it cannot be changed midway. If the instruction has been completed or you think the current operation cannot complete the instruction, click \"Clear\".
	5. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom experience.""")

	# The instruction box is created before the Blocks context so gr.Examples can
	# reference it; it is placed in the layout later via render().
	text_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")


	def _on_submit(image, instruction):
	    """Run one chat turn, then lock the instruction box and clear the screenshot."""
	    return (chatbot(image, instruction),) + lock_input(image, instruction)


	# NOTE(review): the source's indentation was lost, so the nesting of the layout
	# containers below is reconstructed — confirm it against the deployed layout.
	with gr.Blocks() as demo:
	    gr.Markdown("# Mobile-Agent")
	    with gr.Row():
	        # Left column: usage notes, screenshot upload and the example flow.
	        with gr.Column(scale=4):
	            gr.Markdown(tos_markdown)
	            with gr.Row():
	                image_input = gr.Image(label="Screenshot", type="pil", height=570, width=300)
	                gr.Examples(
	                    examples=[
	                        ["./example/1.jpg", "Turn on the dark mode"],
	                        ["./example/2.jpg", "Turn on the dark mode"],
	                        ["./example/3.jpg", "Turn on the dark mode"],
	                        ["./example/4.jpg", "Turn on the dark mode"],
	                        ["./example/5.jpg", "Turn on the dark mode"],
	                    ],
	                    inputs=[image_input, text_input],
	                )

	        # Right column: instruction box, action buttons and the chat history.
	        with gr.Column(scale=6):
	            text_input.render()
	            with gr.Row():
	                submit_button = gr.Button("Submit")
	                clear_button = gr.Button("Clear")
	            output_component = gr.HTML(label="Chat history")

	    # Submit returns three values: chat HTML, instruction update, image update.
	    submit_button.click(
	        fn=_on_submit,
	        inputs=[image_input, text_input],
	        outputs=[output_component, text_input, image_input],
	    )

	    # Clear resets the module-level state and returns two values.
	    clear_button.click(
	        fn=reset_demo,
	        inputs=[],
	        outputs=[output_component, text_input],
	    )

	queued_demo = demo.queue()
	queued_demo.launch(share=False)