import gradio as gr
import base64
import io
import requests
import json
from PIL import ImageDraw
from io import BytesIO
import os

# Conversation state shared across turns of one demo session.
chat_log = []
request_count = 0
now_session_id = ""

# The Mobile-Agent inference endpoint and its access token are read from the environment.
TOKEN = os.environ.get('TOKEN')
URL = os.environ.get('URL')

# Styles for the HTML chat transcript. These rules (and the div class names used in
# chatbot() below) are a minimal placeholder layout, not a required design.
chatbot_css = """
<style>
.chat-container { display: flex; flex-direction: column; }
.user-message { text-align: right; margin: 6px 0; }
.bot-message { text-align: left; margin: 6px 0; }
.chat-image { max-width: 300px; height: auto; }
</style>
"""


def encode_image(image):
    # Serialize a PIL image to a base64-encoded JPEG string for the API payload.
    buffer = BytesIO()
    image.convert("RGB").save(buffer, format="JPEG")  # JPEG cannot store an alpha channel
    encoded_image = base64.b64encode(buffer.getvalue()).decode('utf-8')
    return encoded_image


def get_action(image, query, session_id):
    # Send the current screenshot (and, on the first turn, the instruction) to the
    # Mobile-Agent server; return the predicted action, its parameter, and the session id.
    image_base = encode_image(image)
    headers = {
        'Authorization': f"Bearer {TOKEN}",
        'Content-Type': 'application/json'
    }
    data = {
        "model": "pre-Mobile_Agent_Server-1664",
        "input": {
            "screenshot": image_base,
            "query": query,
            "session_id": session_id
        }
    }
    # Retry until the server answers with HTTP 200.
    while True:
        try:
            response = requests.post(URL, headers=headers, data=json.dumps(data), timeout=1500)
            output = response.json()['output']
            action = output['action']
            parameter = output['parameter']
            session_id = output['session_id']
        except Exception as e:
            # A failed request or an unexpected payload is logged and the call is retried.
            print(e)
        else:
            if response.status_code == 200:
                break
    return action, parameter, session_id


def image_to_base64(image):
    # Inline a PIL image as a base64 data URI so it can be embedded in the chat HTML.
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    img_html = f'<img class="chat-image" src="data:image/png;base64,{img_str}">'
    return img_html


def chatbot(image, text):
    # One dialogue turn: record the user's input, query the server, and render the
    # resulting operation instruction (with an annotated screenshot) into the chat log.
    global chat_log, request_count, now_session_id
    request_count += 1

    user_msg = "<div class='user-message'>{}</div>".format(text)
    if image is not None:
        user_img_html = image_to_base64(image)
        user_msg += "<div class='user-message'>{}</div>".format(user_img_html)

    # The instruction is only sent on the first turn; later turns reuse the session id.
    if request_count == 1:
        action, parameter, session_id = get_action(image, text, "")
        now_session_id = session_id
    else:
        action, parameter, session_id = get_action(image, "", now_session_id)

    if action == 'end':
        if parameter == '':
            bot_response = "The instruction has been completed. Please click \"Clear\"."
        else:
            bot_response = str(parameter)
    elif action == 'tap':
        bot_response = "Please click on the red circle and upload the current screenshot again."
        parameter = json.loads(parameter)
        x, y = parameter[0], parameter[1]
        radius = 75
        draw = ImageDraw.Draw(image)
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10)
    elif action == 'slide':
        bot_response = "Please slide from the blue circle to the red circle and upload the current screenshot again."
        parameter = json.loads(parameter)
        x1, y1, x2, y2 = parameter[0][0], parameter[0][1], parameter[1][0], parameter[1][1]
        radius = 75
        draw = ImageDraw.Draw(image)
        draw.ellipse([x1 - radius, y1 - radius, x1 + radius, y1 + radius], outline='red', width=10)
        draw.ellipse([x2 - radius, y2 - radius, x2 + radius, y2 + radius], outline='blue', width=10)
    elif action == 'type':
        parameter = str(parameter)
        bot_response = f"Please type \"{parameter}\" and upload the current screenshot again."
    elif action == 'back':
        bot_response = "Please go back to the previous page and upload the current screenshot again."
    elif action == 'exit':
        bot_response = "Please return to the home page and upload the current screenshot again."
    else:
        # Fallback so an unexpected action never leaves bot_response undefined.
        bot_response = f"Received an unrecognized action \"{action}\". Please click \"Clear\" and try again."

    bot_msg = "<div class='bot-message'>{}</div>".format(bot_response)
    if image is not None:
        bot_img_html = image_to_base64(image)
        bot_msg += "<div class='bot-message'>{}</div>".format(bot_img_html)

    chat_log.append(user_msg)
    chat_log.append(bot_msg)
    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
    return chatbot_css + chat_html


def lock_input(image, instruction):
    # After the first submit, freeze the instruction box and clear the screenshot slot.
    return gr.update(value=instruction, interactive=False), gr.update(value=None)


def reset_demo():
    # "Clear" resets the conversation state and unlocks the instruction box.
    global chat_log, request_count, now_session_id
    chat_log = []
    request_count = 0
    now_session_id = ""
    return "", gr.update(value="", interactive=True)


tos_markdown = ("""**Terms of use**
1. In the Instruction field, enter the instruction you want to execute.
2. In the Screenshot field, upload a screenshot of your current mobile device.
3. Click "Submit" to get the operation. Perform it on your mobile device, then upload the current screenshot again.
4. Once the instruction is entered, it cannot be changed midway. If the instruction has been completed, or you think the current operation cannot complete it, click "Clear".
5. The 5 cases in "Examples" form a complete flow. Click and submit them from top to bottom to try it out.""")

title_markdown = ("""# Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual Perception
If you like our project, please give us a star ✨ on GitHub for the latest updates.""")

text_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")

with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(title_markdown)
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(tos_markdown)
            with gr.Row():
                image_input = gr.Image(label="Screenshot", type="pil", height=570, width=300)
                gr.Examples(examples=[
                    ["./example/1.jpg", "Turn on the dark mode"],
                    ["./example/2.jpg", "Turn on the dark mode"],
                    ["./example/3.jpg", "Turn on the dark mode"],
                    ["./example/4.jpg", "Turn on the dark mode"],
                    ["./example/5.jpg", "Turn on the dark mode"],
                ], inputs=[image_input, text_input])
        with gr.Column(scale=6):
            text_input.render()
            with gr.Row():
                submit_button = gr.Button("Submit")
                clear_button = gr.Button("Clear")
            output_component = gr.HTML(label="Chat history")

    submit_button.click(
        fn=lambda image, instruction: (chatbot(image, instruction),) + lock_input(image, instruction),
        inputs=[image_input, text_input],
        outputs=[output_component, text_input, image_input]
    )
    clear_button.click(
        fn=reset_demo,
        inputs=[],
        outputs=[output_component, text_input]
    )

demo.queue().launch(share=False)
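# A minimal usage sketch, assuming this script is saved as app.py and that TOKEN and
# URL (read from the environment above) point to a reachable Mobile-Agent inference
# endpoint; the token and address values below are placeholders:
#
#   export TOKEN="your-api-token"
#   export URL="https://your-mobile-agent-endpoint"
#   python app.py
#
# Each "Submit" sends the current screenshot through get_action() and renders the
# returned action (tap / slide / type / back / exit / end) into the chat history,
# which is why the screenshot must be re-uploaded after every operation.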