Mobile-Agent / app.py
阳渠
Add application file
0f17fe9
import gradio as gr
import base64
import io
import requests
import json
from PIL import ImageDraw
from io import BytesIO
# Conversation state kept at module level. chatbot() and reset_demo() rebind
# these through `global`, so every caller in this process shares the same values.
now_session_id = ""  # session id returned by the agent service on the first request
request_count = 0  # number of submits handled since the last reset
chat_log = []  # rendered HTML fragments, user message followed by bot message
# Inline stylesheet that chatbot() prepends to the chat transcript HTML it returns.
# It styles the scrollable container plus the user/bot message bubbles and the
# user/bot image wrappers referenced by class name in the generated markup.
chatbot_css = """
<style>
.chat-container {
display: flex;
flex-direction: column;
overflow-y: auto;
max-height: 630px;
margin: 10px;
}
.user-message, .bot-message {
margin: 5px;
padding: 10px;
border-radius: 10px;
}
.user-message {
text-align: right;
background-color: #7B68EE;
color: white;
align-self: flex-end;
}
.bot-message {
text-align: left;
background-color: #ADD8E6;
color: black;
align-self: flex-start;
}
.user-image {
text-align: right;
align-self: flex-end;
max-width: 150px;
max-height: 300px;
}
.bot-image {
text-align: left;
align-self: flex-start;
max-width: 200px;
max-height: 400px;
}
</style>
"""
def encode_image(image):
    """Return *image* serialised as JPEG and encoded as base64 text.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert()`` and
            ``save(fp, format=...)``.

    Returns:
        str: The base64 representation of the JPEG bytes.
    """
    # JPEG cannot store an alpha channel or a palette, so saving e.g. an RGBA
    # or P-mode screenshot fails with OSError. Normalise those to RGB first.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_modes:
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
def get_action(image, query, session_id, api_key=None, timeout=1500):
    """Send one screenshot to the Mobile-Agent service and return the raw response.

    Args:
        image: Screenshot to send; it is serialised with ``encode_image``.
        query: Instruction text for the agent.
        session_id: Session identifier to continue, or ``""`` to start a new one.
        api_key: Bearer token for the service. When omitted, the
            ``DASHSCOPE_API_KEY`` environment variable is used, falling back to
            the key that was historically embedded in this file.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The ``requests`` response object, unmodified; callers decode the JSON.
    """
    import os

    if api_key is None:
        # NOTE(security): the literal below is the credential that used to be
        # hard-coded here. It is kept only so existing deployments keep working;
        # set DASHSCOPE_API_KEY, rotate this key and remove the fallback.
        api_key = (
            os.environ.get("DASHSCOPE_API_KEY")
            or "sk-6bddfc116de744c3aa1d66893cc87b20"
        )

    headers = {
        'Authorization': "Bearer " + api_key,
        'Content-Type': 'application/json'
    }
    data = {
        "model": "pre-Mobile_Agent_Server-1664",
        "input": {
            "screenshot": encode_image(image),
            "query": query,
            "session_id": session_id
        }
    }
    return requests.post(
        "https://poc-dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation",
        headers=headers,
        data=json.dumps(data),
        timeout=timeout,
    )
def image_to_base64(image):
    """Return an HTML ``<img>`` tag embedding *image* as a base64 PNG data URI.

    Args:
        image: Object exposing ``save(fp, format=...)`` (a PIL image in this app).

    Returns:
        str: The ``<img ... />`` markup with the PNG bytes inlined.
    """
    png_stream = io.BytesIO()
    image.save(png_stream, format="PNG")
    encoded = base64.b64encode(png_stream.getvalue()).decode("utf-8")
    return '<img src="data:image/png;base64,{}" />'.format(encoded)
def _load_coordinates(parameter):
    """Decode a coordinate payload given either as JSON text or as a parsed list."""
    if isinstance(parameter, (str, bytes, bytearray)):
        return json.loads(parameter)
    return parameter


def _draw_circle(image, x, y, color, width, radius=50):
    """Outline a circle of *radius* pixels centred on (x, y), drawn onto *image* in place."""
    ImageDraw.Draw(image).ellipse(
        [x - radius, y - radius, x + radius, y + radius],
        outline=color,
        width=width,
    )


def _describe_action(action, parameter, image):
    """Translate one agent action into the instruction text shown to the user.

    For ``tap`` and ``slide`` the target circles are also drawn onto *image*.
    A malformed coordinate payload surfaces as ValueError, TypeError,
    IndexError or KeyError for the caller to handle.
    """
    if action == 'end':
        if parameter == '':
            return "The instructions have been completed. Please click \"Clear\"."
        return str(parameter)
    if action == 'tap':
        point = _load_coordinates(parameter)
        _draw_circle(image, point[0], point[1], 'red', 4)
        return "Please click on the red circle and upload the current screenshot again."
    if action == 'slide':
        points = _load_coordinates(parameter)
        x1, y1 = points[0][0], points[0][1]
        x2, y2 = points[1][0], points[1][1]
        # NOTE(review): the first point is drawn red and the second blue, while
        # the message says "from blue ... to red". Confirm which point the
        # service treats as the start before changing either side.
        _draw_circle(image, x1, y1, 'red', 5)
        _draw_circle(image, x2, y2, 'blue', 5)
        return "Please slide from blue circle to red circle and upload the current screenshot again."
    if action == 'type':
        return f"Please type the \"{parameter}\" and upload the current screenshot again."
    if action == 'back':
        return "Please back to previous page and upload the current screenshot again."
    if action == 'exit':
        return "Please back to home page and upload the current screenshot again."
    # Previously an unknown action left the reply text undefined and crashed.
    return f"The agent returned an unsupported action: {action!r}."


def chatbot(image, text):
    """Handle one submit: query the agent, record the exchange, return the chat HTML.

    The first successful request of a conversation sends *text* as the query and
    stores the session id returned by the service; later requests send an empty
    query together with that stored session id.

    Args:
        image: Uploaded screenshot (PIL image) or ``None`` when nothing was uploaded.
        text: Instruction typed by the user.

    Returns:
        str: The stylesheet followed by the full chat transcript as HTML.
    """
    global chat_log, request_count, now_session_id
    import html  # local import: only this handler needs HTML escaping

    # Escape user text so markup typed into the instruction box is shown
    # literally instead of being injected into the page.
    user_msg = "<div class='user-message'>{}</div>".format(
        html.escape(str(text), quote=False)
    )
    show_image = False

    if image is None:
        # Without a screenshot there is nothing to send; do not consume a request.
        bot_response = "Please upload a screenshot before submitting."
    else:
        # Render the user's copy before any circles are drawn onto the image.
        user_msg += "<div class='user-image'>{}</div>".format(image_to_base64(image))
        is_first_request = request_count == 0
        try:
            if is_first_request:
                response = get_action(image, text, "")
            else:
                response = get_action(image, "", now_session_id)
            output = response.json()['output']  # parse the body once
            action = output['action']
            parameter = output['parameter']
            session_id = output['session_id'] if is_first_request else now_session_id
        except Exception as exc:
            # UI boundary: report the failure in the chat instead of crashing on
            # unbound names. State is left untouched so a retry is still treated
            # as the same step of the conversation.
            print(f"Mobile-Agent request failed: {exc!r}")
            bot_response = "The request to the agent failed. Please submit the screenshot again."
        else:
            request_count += 1
            now_session_id = session_id
            try:
                bot_response = _describe_action(action, parameter, image)
                show_image = True
            except (ValueError, TypeError, IndexError, KeyError) as exc:
                print(f"Mobile-Agent returned a malformed parameter: {exc!r}")
                bot_response = "The agent returned an operation that could not be displayed."

    # The reply may contain text produced by the service, so escape it as well.
    bot_msg = "<div class='bot-message'>{}</div>".format(
        html.escape(bot_response, quote=False)
    )
    if show_image:
        bot_msg += "<div class='bot-image'>{}</div>".format(image_to_base64(image))

    chat_log.append(user_msg)
    chat_log.append(bot_msg)
    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
    return chatbot_css + chat_html
def lock_input(image, instruction):
    """Build the component updates applied right after a submit.

    Args:
        image: Submitted screenshot; accepted for signature symmetry but unused.
        instruction: Instruction text to keep displayed in the textbox.

    Returns:
        tuple: An update that keeps *instruction* in the textbox and makes it
        non-interactive, followed by an update that sets the image value to None.
    """
    frozen_instruction = gr.update(value=instruction, interactive=False)
    cleared_image = gr.update(value=None)
    return frozen_instruction, cleared_image
def reset_demo():
    """Forget the current conversation and re-enable the instruction box.

    Rebinds the module-level chat log, request counter and session id to their
    initial values.

    Returns:
        tuple: An empty string for the chat history output, followed by an
        update that empties the textbox and makes it interactive again.
    """
    global chat_log, request_count, now_session_id
    chat_log, request_count, now_session_id = [], 0, ""
    empty_history = ""
    unlocked_instruction = gr.update(value="", interactive=True)
    return empty_history, unlocked_instruction
# Usage instructions, rendered with gr.Markdown when the UI is built.
tos_markdown = """### Terms of use
1. In the Instruction field, enter the instruction you want to execute.
2. In the Screenshot field, upload a screenshot of your current mobile device.
3. Click "Submit" to get the operation. You need to operate your mobile device according to the operation and then upload the current screenshot again.
4. Once the Instruction is entered, it cannot be changed midway. If the instruction has been completed or you think the current operation cannot complete the instruction, click "Clear".
5. The 5 cases in "Examples" are a complete flow. Click and submit from top to bottom experience."""
# NOTE(review): this copy of the file has lost its indentation, so the nesting of
# the layout containers below cannot be read from the text. Restore it from the
# original file before running; the statements themselves are left untouched.
#
# The instruction textbox is created before the Blocks context and only placed
# later through render(), so gr.Examples can list it as an input before that point.
text_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
# Build the page: title, usage notes, screenshot upload with examples, the
# instruction box, Submit/Clear buttons and the HTML chat history.
with gr.Blocks() as demo:
gr.Markdown("# Mobile-Agent")
with gr.Row():
with gr.Column(scale=4):
gr.Markdown(tos_markdown)
with gr.Row():
# type="pil" means the handlers receive the screenshot as a PIL image.
image_input = gr.Image(label="Screenshot", type="pil", height=570, width=300)
# Five example screenshots, all paired with the same instruction text.
gr.Examples(examples=[
["./example/1.jpg", "Turn on the dark mode"],
["./example/2.jpg", "Turn on the dark mode"],
["./example/3.jpg", "Turn on the dark mode"],
["./example/4.jpg", "Turn on the dark mode"],
["./example/5.jpg", "Turn on the dark mode"],
], inputs=[image_input, text_input])
with gr.Column(scale=6):
text_input.render()
with gr.Row():
submit_button = gr.Button("Submit")
clear_button = gr.Button("Clear")
output_component = gr.HTML(label="Chat history")
# Submit: chatbot() produces the chat HTML, and lock_input() contributes the
# updates for the textbox and the image; the three values map onto `outputs` in order.
submit_button.click(
fn=lambda image, instruction: (chatbot(image, instruction),) + lock_input(image, instruction),
inputs=[image_input, text_input],
outputs=[output_component, text_input, image_input]
)
# Clear: reset_demo() returns the new values for the chat history and the textbox.
clear_button.click(
fn=reset_demo,
inputs=[],
outputs=[output_component, text_input]
)
# Enable request queuing and start the app with share=False.
demo.queue().launch(share=False)