Spaces:

junyangwang0410
/

Mobile-Agent

Sleeping

App Files Files Community

阳渠 committed on Feb 21

Commit

0f17fe9

•

1 Parent(s): 664f979

Add application file

Browse files

Files changed (1) hide show

app.py +211 -0

app.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import base64
+import html
+import io
+import json
+import os
+from io import BytesIO
+
+import gradio as gr
+import requests
+from PIL import ImageDraw
+# Module-level conversation state: chatbot() updates it and reset_demo() resets it.
+# NOTE(review): these are process-wide globals, so they are presumably shared by
+# every browser session served by this process -- confirm that this is intended.
+chat_log = []  # HTML fragments (user and bot messages) in display order
+request_count = 0  # lets chatbot() tell the first request of a conversation from follow-ups
+now_session_id = ""  # session id returned by the server for the current conversation
+# Inline stylesheet that chatbot() prepends to the rendered chat history.
+chatbot_css = """
+<style>
+.chat-container {
+    display: flex;
+    flex-direction: column;
+    overflow-y: auto;
+    max-height: 630px;
+    margin: 10px;
+}
+.user-message, .bot-message {
+    margin: 5px;
+    padding: 10px;
+    border-radius: 10px;
+}
+.user-message {
+    text-align: right;
+    background-color: #7B68EE;
+    color: white;
+    align-self: flex-end;
+}
+.bot-message {
+    text-align: left;
+    background-color: #ADD8E6;
+    color: black;
+    align-self: flex-start;
+}
+.user-image {
+    text-align: right;
+    align-self: flex-end;
+    max-width: 150px;
+    max-height: 300px;
+}
+.bot-image {
+    text-align: left;
+    align-self: flex-start;
+    max-width: 200px;
+    max-height: 400px;
+}
+</style>
+"""
+def encode_image(image):
+    buffer = BytesIO()
+    image.save(buffer, format="JPEG")
+    encoded_image = base64.b64encode(buffer.getvalue()).decode('utf-8')
+    return encoded_image
+def get_action(image, query, session_id):
+    image_base = encode_image(image)
+    headers = {
+        'Authorization': "Bearer sk-6bddfc116de744c3aa1d66893cc87b20",
+        'Content-Type': 'application/json'
+    }
+    data = {
+        "model": "pre-Mobile_Agent_Server-1664",
+        "input": {
+            "screenshot": image_base,
+            "query": query,
+            "session_id": session_id
+        }
+    }
+    response = requests.post("https://poc-dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation", headers=headers, data=json.dumps(data), timeout=1500)
+    return response
+def image_to_base64(image):
+    buffered = io.BytesIO()
+    image.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    img_html = f'<img src="data:image/png;base64,{img_str}" />'
+    return img_html
+def chatbot(image, text):
+    global chat_log, request_count, now_session_id
+    request_count += 1
+    user_msg = "<div class='user-message'>{}</div>".format(text)
+    if image is not None:
+        user_img_html = image_to_base64(image)
+        user_msg += "<div class='user-image'>{}</div>".format(user_img_html)
+    if request_count == 1:
+        try:
+            response = get_action(image, text, "")
+            action = response.json()['output']['action']
+            parameter = response.json()['output']['parameter']
+            session_id = response.json()['output']['session_id']
+            now_session_id = session_id
+        except:
+            print(response)
+    else:
+        try:
+            response = get_action(image, "", now_session_id)
+            action = response.json()['output']['action']
+            parameter = response.json()['output']['parameter']
+        except:
+            print(response)
+    if action == 'end':
+        if parameter == '':
+            bot_response = "The instructions have been completed. Please click \"Clear\"."
+        else:
+            bot_response = str(parameter)
+    elif action == 'tap':
+        bot_response = "Please click on the red circle and upload the current screenshot again."
+        parameter = json.loads(parameter)
+        x, y = parameter[0], parameter[1]
+        radius = 50
+        draw = ImageDraw.Draw(image)
+        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=4)
+    elif action == 'slide':
+        bot_response = "Please slide from blue circle to red circle and upload the current screenshot again."
+        parameter = json.loads(parameter)
+        x1, y1, x2, y2 = parameter[0][0], parameter[0][1], parameter[1][0], parameter[1][1]
+        radius = 50
+        draw = ImageDraw.Draw(image)
+        draw.ellipse([x1 - radius, y1 - radius, x1 + radius, y1 + radius], outline='red', width=5)
+        draw.ellipse([x2 - radius, y2 - radius, x2 + radius, y2 + radius], outline='blue', width=5)
+    elif action == 'type':
+        parameter = str(parameter)
+        bot_response = f"Please type the \"{parameter}\" and upload the current screenshot again."
+    elif action == 'back':
+        bot_response = f"Please back to previous page and upload the current screenshot again."
+    elif action == 'exit':
+        bot_response = f"Please back to home page and upload the current screenshot again."
+    bot_msg = "<div class='bot-message'>{}</div>".format(bot_response)
+    if image is not None:
+        bot_img_html = image_to_base64(image)
+        bot_msg += "<div class='bot-image'>{}</div>".format(bot_img_html)
+    chat_log.append(user_msg)
+    chat_log.append(bot_msg)
+    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
+    return chatbot_css + chat_html
+def lock_input(image, instruction):
+    return gr.update(value=instruction, interactive=False), gr.update(value=None)
+def reset_demo():
+    global chat_log, request_count, now_session_id
+    chat_log = []
+    request_count = 0
+    now_session_id = ""
+    return "", gr.update(value="", interactive=True)
+# Usage instructions rendered as Markdown in the left-hand column of the page.
+tos_markdown = ("""### Terms of use
+1. In the Instruction field, enter the instruction you want to execute.
+2. In the Screenshot field, upload a screenshot of your current mobile device.
+3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the current screenshot again.
+4. Once the Instruction is entered, it cannot be changed midway. If the instruction has been completed or you think the current operation cannot complete the instruction, click \"Clear\".
+5. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom experience.""")
+# Created before the layout so gr.Examples can name it as an input; it is
+# placed on the page later via text_input.render().
+text_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
+# Page layout: instructions, screenshot upload and examples in the left column
+# (scale 4); instruction box, buttons and chat history in the right column (scale 6).
+with gr.Blocks() as demo:
+    gr.Markdown("# Mobile-Agent")
+    with gr.Row():
+        with gr.Column(scale=4):
+            gr.Markdown(tos_markdown)
+            with gr.Row():
+                # type="pil" so chatbot() receives a PIL image it can draw on.
+                image_input = gr.Image(label="Screenshot", type="pil", height=570, width=300)
+                # Five screenshots that all use the same instruction.
+                gr.Examples(examples=[
+                    ["./example/1.jpg", "Turn on the dark mode"],
+                    ["./example/2.jpg", "Turn on the dark mode"],
+                    ["./example/3.jpg", "Turn on the dark mode"],
+                    ["./example/4.jpg", "Turn on the dark mode"],
+                    ["./example/5.jpg", "Turn on the dark mode"],
+                ], inputs=[image_input, text_input])
+        with gr.Column(scale=6):
+            # The textbox was constructed earlier at module level; place it here.
+            text_input.render()
+            with gr.Row():
+                submit_button = gr.Button("Submit")
+                clear_button = gr.Button("Clear")
+            output_component = gr.HTML(label="Chat history")
+    # Submit: chatbot() produces the chat HTML, then lock_input() supplies the
+    # updates for the instruction box and the screenshot; the three values are
+    # concatenated into one tuple matching the three outputs.
+    submit_button.click(
+        fn=lambda image, instruction: (chatbot(image, instruction),) + lock_input(image, instruction),
+        inputs=[image_input, text_input],
+        outputs=[output_component, text_input, image_input]
+    )
+    # Clear: reset_demo() resets the conversation state and returns values for
+    # the chat history and the instruction box.
+    clear_button.click(
+        fn=reset_demo,
+        inputs=[],
+        outputs=[output_component, text_input]
+    )
+# Launches the app when the module is executed; share=False is passed explicitly.
+demo.queue().launch(share=False)