# Spaces:
# Sleeping
# Sleeping
import base64
import html
import io
import json
import os
from io import BytesIO

import gradio as gr
import requests
from PIL import ImageDraw
# Conversation state. These are module-level globals, so they are shared by
# every caller of chatbot()/reset_demo() in this process.
chat_log = []  # rendered HTML fragments, alternating user / bot messages
request_count = 0  # number of turns in the current conversation
now_session_id = ""  # session id returned by the server on the first turn
# Stylesheet prepended to the chat HTML returned by chatbot(); the class names
# here match the <div class='...'> wrappers that chatbot() emits.
chatbot_css = """
<style>
    .chat-container {
        display: flex;
        flex-direction: column;
        overflow-y: auto;
        max-height: 630px;
        margin: 10px;
    }
    .user-message, .bot-message {
        margin: 5px;
        padding: 10px;
        border-radius: 10px;
    }
    .user-message {
        text-align: right;
        background-color: #7B68EE;
        color: white;
        align-self: flex-end;
    }
    .bot-message {
        text-align: left;
        background-color: #ADD8E6;
        color: black;
        align-self: flex-start;
    }
    .user-image {
        text-align: right;
        align-self: flex-end;
        max-width: 150px;
        max-height: 300px;
    }
    .bot-image {
        text-align: left;
        align-self: flex-start;
        max-width: 200px;
        max-height: 400px;
    }
</style>
"""
def encode_image(image):
    """Return *image* encoded as JPEG and base64, as an ASCII ``str``.

    Args:
        image: A PIL image (anything exposing ``mode``, ``convert`` and
            ``save(fp, format=...)``).

    Returns:
        The base64 text of the JPEG bytes, without a ``data:`` prefix.
    """
    # Pillow's JPEG writer rejects modes such as RGBA, LA and P (common for
    # PNG screenshots), so convert those to RGB first. convert() returns a new
    # image; the caller's object is left untouched.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
def get_action(image, query, session_id):
    """POST a screenshot and instruction to the agent endpoint.

    Args:
        image: PIL image of the current screen; sent as base64 JPEG.
        query: Instruction text placed in the ``query`` field.
        session_id: Session id placed in the ``session_id`` field.

    Returns:
        The ``requests.Response`` exactly as returned by ``requests.post``;
        the HTTP status is not checked here.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # NOTE(review): this bearer token was committed to source control. Prefer
    # supplying it through the DASHSCOPE_API_KEY environment variable; the
    # literal is kept only as a fallback so existing deployments keep working
    # and should be rotated and removed.
    api_key = os.environ.get(
        "DASHSCOPE_API_KEY", "sk-6bddfc116de744c3aa1d66893cc87b20"
    )
    headers = {
        'Authorization': f"Bearer {api_key}",
        'Content-Type': 'application/json',
    }
    payload = {
        "model": "pre-Mobile_Agent_Server-1664",
        "input": {
            "screenshot": encode_image(image),
            "query": query,
            "session_id": session_id,
        },
    }
    return requests.post(
        "https://poc-dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation",
        headers=headers,
        data=json.dumps(payload),
        timeout=1500,
    )
def image_to_base64(image):
    """Return an HTML ``<img>`` tag embedding *image* as a PNG data URI.

    Despite the name, the result is a complete ``<img ... />`` element rather
    than the bare base64 text.

    Args:
        image: A PIL image (anything exposing ``save(fp, format=...)``).
    """
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    img_str = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return f'<img src="data:image/png;base64,{img_str}" />'
_MARKER_RADIUS = 50  # radius, in pixels, of the circles drawn on the screenshot


def _draw_marker(image, x, y, color, width):
    """Outline a circle centred on (x, y) on *image*, modifying it in place."""
    ImageDraw.Draw(image).ellipse(
        [x - _MARKER_RADIUS, y - _MARKER_RADIUS,
         x + _MARKER_RADIUS, y + _MARKER_RADIUS],
        outline=color,
        width=width,
    )


def _decode_points(parameter):
    """Parse *parameter* from JSON when it arrives as text; else return it as is."""
    return json.loads(parameter) if isinstance(parameter, str) else parameter


def _describe_action(image, action, parameter):
    """Turn one server action into user instructions, annotating *image* if needed."""
    if action == 'end':
        if parameter == '':
            return "The instructions have been completed. Please click \"Clear\"."
        return str(parameter)
    if action == 'tap':
        point = _decode_points(parameter)
        _draw_marker(image, point[0], point[1], 'red', 4)
        return "Please click on the red circle and upload the current screenshot again."
    if action == 'slide':
        points = _decode_points(parameter)
        # The message says "from blue ... to red", so the start point must be
        # blue and the end point red (the colours used to be swapped).
        # NOTE(review): assumes parameter[0] is the start and parameter[1] the
        # end of the gesture - confirm against the server contract.
        _draw_marker(image, points[0][0], points[0][1], 'blue', 5)
        _draw_marker(image, points[1][0], points[1][1], 'red', 5)
        return "Please slide from blue circle to red circle and upload the current screenshot again."
    if action == 'type':
        return f"Please type the \"{parameter}\" and upload the current screenshot again."
    if action == 'back':
        return "Please back to previous page and upload the current screenshot again."
    if action == 'exit':
        return "Please back to home page and upload the current screenshot again."
    # Unknown actions used to leave the reply unbound and crash the handler.
    return f"The server returned an unsupported action \"{action}\". Please click \"Clear\" and try again."


def _run_turn(image, text):
    """Ask the server for the next action and return the bot's reply text.

    Conversation state (request_count, now_session_id) is committed only after
    the response has been parsed, so a failed request can simply be retried.
    """
    global request_count, now_session_id
    first_turn = request_count == 0
    response = None
    try:
        if first_turn:
            # The instruction is sent once; the server hands back a session id.
            response = get_action(image, text, "")
        else:
            response = get_action(image, "", now_session_id)
        output = response.json()['output']
        action = output['action']
        parameter = output['parameter']
        session_id = output['session_id'] if first_turn else now_session_id
    except (requests.RequestException, OSError, ValueError, KeyError, TypeError) as err:
        print(response, err)
        return "The request to the server failed. Please upload the screenshot and submit again."

    request_count += 1
    now_session_id = session_id

    try:
        return _describe_action(image, action, parameter)
    except (ValueError, TypeError, IndexError, KeyError) as err:
        print(response, err)
        return "The server returned an operation that could not be understood. Please click \"Clear\" and try again."


def chatbot(image, text):
    """Run one chat turn and return the full chat history as an HTML string.

    Appends the user's message (and screenshot) plus the bot's reply (and the
    screenshot annotated with tap/slide markers) to the module-level
    ``chat_log``, then renders the whole log.

    Args:
        image: PIL screenshot, or ``None`` when nothing was uploaded. It is
            drawn on in place for ``tap`` and ``slide`` actions.
        text: The user's instruction.

    Returns:
        ``chatbot_css`` followed by a ``chat-container`` div holding every
        message so far.
    """
    global chat_log, request_count, now_session_id

    # Both the instruction and the server's reply end up inside gr.HTML, so
    # they are escaped to keep markup in them from being interpreted.
    user_msg = "<div class='user-message'>{}</div>".format(
        html.escape(str(text), quote=False)
    )
    if image is None:
        # Without a screenshot there is nothing to send; say so instead of
        # failing inside the request path.
        bot_response = "Please upload a screenshot before clicking \"Submit\"."
    else:
        # Rendered before _run_turn() so the user bubble shows the
        # unannotated screenshot.
        user_msg += "<div class='user-image'>{}</div>".format(image_to_base64(image))
        bot_response = _run_turn(image, text)

    bot_msg = "<div class='bot-message'>{}</div>".format(
        html.escape(bot_response, quote=False)
    )
    if image is not None:
        bot_msg += "<div class='bot-image'>{}</div>".format(image_to_base64(image))

    chat_log.append(user_msg)
    chat_log.append(bot_msg)
    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
    return chatbot_css + chat_html
def lock_input(image, instruction):
    """Return UI updates that freeze the instruction box and clear the image.

    Args:
        image: Unused; accepted so the signature matches the submit inputs.
        instruction: Text to keep showing in the now read-only textbox.

    Returns:
        A 2-tuple of ``gr.update`` objects: the first sets the textbox value
        to *instruction* and makes it non-interactive, the second resets the
        image value to ``None``.
    """
    textbox_update = gr.update(value=instruction, interactive=False)
    image_update = gr.update(value=None)
    return textbox_update, image_update
def reset_demo():
    """Forget the current conversation and unlock the instruction box.

    Resets the module-level ``chat_log``, ``request_count`` and
    ``now_session_id`` to their initial values.

    Returns:
        A 2-tuple: an empty string, and a ``gr.update`` that empties the
        textbox and makes it interactive again.
    """
    global chat_log, request_count, now_session_id
    chat_log = []
    request_count = 0
    now_session_id = ""
    return "", gr.update(value="", interactive=True)
# Usage instructions rendered with gr.Markdown in the left-hand column.
tos_markdown = ("""### Terms of use
1. In the Instruction field, enter the instruction you want to execute.
2. In the Screenshot field, upload a screenshot of your current mobile device.
3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the current screenshot again.
4. Once the Instruction is entered, it cannot be changed midway. If the instruction has been completed or you think the current operation cannot complete the instruction, click \"Clear\".
5. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom experience.""")
# Created outside the Blocks context so gr.Examples can list it as an input
# before it is placed in the layout with text_input.render().
text_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")


def _on_submit(image, instruction):
    """Run one chat turn, then lock the instruction box and clear the image."""
    return (chatbot(image, instruction),) + lock_input(image, instruction)


# NOTE(review): the nesting of the `with` blocks below was reconstructed from
# the statement order because the original indentation was lost - confirm the
# layout matches the intended UI.
with gr.Blocks() as demo:
    gr.Markdown("# Mobile-Agent")
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown(tos_markdown)
            with gr.Row():
                image_input = gr.Image(label="Screenshot", type="pil", height=570, width=300)
                gr.Examples(
                    examples=[
                        ["./example/1.jpg", "Turn on the dark mode"],
                        ["./example/2.jpg", "Turn on the dark mode"],
                        ["./example/3.jpg", "Turn on the dark mode"],
                        ["./example/4.jpg", "Turn on the dark mode"],
                        ["./example/5.jpg", "Turn on the dark mode"],
                    ],
                    inputs=[image_input, text_input],
                )
        with gr.Column(scale=6):
            text_input.render()
            with gr.Row():
                submit_button = gr.Button("Submit")
                clear_button = gr.Button("Clear")
            output_component = gr.HTML(label="Chat history")

    # Submit: update the chat history, freeze the instruction, clear the image.
    submit_button.click(
        fn=_on_submit,
        inputs=[image_input, text_input],
        outputs=[output_component, text_input, image_input],
    )
    # Clear: wipe the conversation state and make the instruction editable.
    clear_button.click(
        fn=reset_demo,
        inputs=[],
        outputs=[output_component, text_input],
    )

demo.queue().launch(share=False)