阳渠 committed on
Commit
0f17fe9
1 Parent(s): 664f979

Add application file

Browse files
Files changed (1) hide show
  1. app.py +211 -0
app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import html
import io
import json
import os
from io import BytesIO

import gradio as gr
import requests
from PIL import ImageDraw
8
+
9
# Conversation state. These are module-level globals, so they are shared by
# everything running in this Python process and are cleared by reset_demo().
chat_log = []  # rendered HTML fragments (user and bot messages) in display order
request_count = 0  # requests counted since the last reset
now_session_id = ""  # session id handed back by the agent service; "" until known
12
+
13
# Inline stylesheet that chatbot() prepends to the rendered chat transcript.
# User bubbles/images are right-aligned, bot bubbles/images are left-aligned.
chatbot_css = """
<style>
.chat-container {
    display: flex;
    flex-direction: column;
    overflow-y: auto;
    max-height: 630px;
    margin: 10px;
}
.user-message, .bot-message {
    margin: 5px;
    padding: 10px;
    border-radius: 10px;
}
.user-message {
    text-align: right;
    background-color: #7B68EE;
    color: white;
    align-self: flex-end;
}
.bot-message {
    text-align: left;
    background-color: #ADD8E6;
    color: black;
    align-self: flex-start;
}
.user-image {
    text-align: right;
    align-self: flex-end;
    max-width: 150px;
    max-height: 300px;
}
.bot-image {
    text-align: left;
    align-self: flex-start;
    max-width: 200px;
    max-height: 400px;
}
</style>
"""
53
+
54
def encode_image(image):
    """Return *image* as a base64-encoded JPEG string.

    Args:
        image: A PIL-style image exposing ``mode``, ``convert`` and ``save``.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded to ``str``.
    """
    # JPEG has no alpha channel or palette: saving e.g. an RGBA or P image
    # raises OSError in Pillow. Convert those to RGB first; ``convert``
    # returns a new image, so the caller's object is left untouched.
    if image.mode not in ("L", "RGB", "CMYK"):
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
59
+
60
def get_action(image, query, session_id):
    """Send a screenshot and instruction to the Mobile-Agent service.

    Args:
        image: Screenshot to analyse; it is serialised with ``encode_image``.
        query: Instruction text for the agent.
        session_id: Session identifier to continue, or "" when none exists yet.

    Returns:
        The ``requests.Response`` exactly as returned by ``requests.post``.
        Status checking and JSON decoding are left to the caller.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    # SECURITY(review): a credential was committed in source. Prefer the
    # DASHSCOPE_API_KEY environment variable; the embedded value is kept only
    # as a fallback so existing deployments keep working, and should be
    # rotated and removed.
    api_key = os.environ.get("DASHSCOPE_API_KEY") or "sk-6bddfc116de744c3aa1d66893cc87b20"
    endpoint = "https://poc-dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation"

    headers = {
        'Authorization': f"Bearer {api_key}",
        'Content-Type': 'application/json',
    }
    payload = {
        "model": "pre-Mobile_Agent_Server-1664",
        "input": {
            "screenshot": encode_image(image),
            "query": query,
            "session_id": session_id,
        },
    }

    # The long timeout (seconds) is carried over unchanged from the original.
    return requests.post(endpoint, headers=headers, data=json.dumps(payload), timeout=1500)
79
+
80
def image_to_base64(image):
    """Render *image* as an HTML ``<img>`` tag with an inline PNG data URI.

    Args:
        image: A PIL-style image exposing ``save(fp, format=...)``.

    Returns:
        A string of the form ``<img src="data:image/png;base64,..." />``.
    """
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    payload = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return f'<img src="data:image/png;base64,{payload}" />'
86
+
87
_MARKER_RADIUS = 50  # radius, in pixels, of the circles drawn on the screenshot

# Replies for actions that need neither a parameter nor an annotation.
_FIXED_REPLIES = {
    'back': "Please back to previous page and upload the current screenshot again.",
    'exit': "Please back to home page and upload the current screenshot again.",
}


def _request_action(image, text):
    """Query the agent service once and return ``(action, parameter)``.

    The first request of a session sends the instruction text and stores the
    session id from the reply; later requests send an empty query together
    with that stored id. The request is only counted once the reply has been
    parsed, so a failed first attempt can be retried as a first attempt.
    """
    global request_count, now_session_id

    is_first = request_count == 0
    if is_first:
        response = get_action(image, text, "")
    else:
        response = get_action(image, "", now_session_id)

    output = response.json()['output']  # decode the body once
    action = output['action']
    parameter = output['parameter']
    if is_first:
        now_session_id = output['session_id']
    request_count += 1
    return action, parameter


def _load_points(parameter):
    """Decode a coordinate parameter, which the service sends as JSON text."""
    return json.loads(parameter) if isinstance(parameter, str) else parameter


def _draw_marker(image, x, y, colour, width):
    """Draw a circle outline of ``_MARKER_RADIUS`` centred on (x, y), in place."""
    r = _MARKER_RADIUS
    ImageDraw.Draw(image).ellipse([x - r, y - r, x + r, y + r], outline=colour, width=width)


def _describe_action(action, parameter, image):
    """Turn an agent action into an HTML-safe reply, annotating *image* in place."""
    if action == 'end':
        if parameter == '':
            return "The instructions have been completed. Please click \"Clear\"."
        return html.escape(str(parameter))

    if action == 'tap':
        point = _load_points(parameter)
        x, y = point[0], point[1]
        _draw_marker(image, x, y, 'red', 4)
        return "Please click on the red circle and upload the current screenshot again."

    if action == 'slide':
        points = _load_points(parameter)
        # Unpack everything before drawing so a malformed reply cannot leave
        # the screenshot half-annotated.
        x1, y1, x2, y2 = points[0][0], points[0][1], points[1][0], points[1][1]
        _draw_marker(image, x1, y1, 'red', 5)
        _draw_marker(image, x2, y2, 'blue', 5)
        # The first point is drawn red and the second blue, so the wording
        # must be red -> blue (the original text had the colours reversed).
        # NOTE(review): assumes the service orders the points [start, end].
        return "Please slide from red circle to blue circle and upload the current screenshot again."

    if action == 'type':
        typed = html.escape(str(parameter))
        return f"Please type the \"{typed}\" and upload the current screenshot again."

    if action in _FIXED_REPLIES:
        return _FIXED_REPLIES[action]

    unknown = html.escape(str(action))
    return f"The agent returned an unsupported operation \"{unknown}\". Please click \"Clear\"."


def chatbot(image, text):
    """Handle one Submit: query the agent and return the updated chat HTML.

    Args:
        image: Current screenshot as a PIL image, or ``None`` if none was
            uploaded. For tap/slide actions it is annotated in place.
        text: The user's instruction.

    Returns:
        ``chatbot_css`` followed by the whole conversation rendered as HTML.

    The conversation lives in the module-level ``chat_log``, ``request_count``
    and ``now_session_id``, so it is shared process-wide.
    """
    # Escape user text: it is interpolated into raw HTML.
    user_msg = "<div class='user-message'>{}</div>".format(html.escape(str(text)))

    if image is None:
        # Nothing to send; ask for a screenshot instead of crashing.
        bot_response = "Please upload a screenshot before submitting."
    else:
        # Snapshot the user's image before any marker is drawn on it.
        user_msg += "<div class='user-image'>{}</div>".format(image_to_base64(image))
        try:
            action, parameter = _request_action(image, text)
            bot_response = _describe_action(action, parameter, image)
        except (requests.RequestException, IndexError, KeyError, TypeError, ValueError) as err:
            print(f"Mobile-Agent request failed: {err!r}")
            bot_response = (
                "The agent did not return a usable operation. "
                "Please upload the current screenshot again or click \"Clear\"."
            )

    bot_msg = "<div class='bot-message'>{}</div>".format(bot_response)
    if image is not None:
        bot_msg += "<div class='bot-image'>{}</div>".format(image_to_base64(image))

    chat_log.append(user_msg)
    chat_log.append(bot_msg)

    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
    return chatbot_css + chat_html
158
+
159
def lock_input(image, instruction):
    """Build the component updates applied after a submit.

    Args:
        image: Unused; accepted so the signature mirrors the submit inputs.
        instruction: Text to keep showing in the instruction box.

    Returns:
        A 2-tuple of ``gr.update`` objects: the first keeps *instruction* as
        the value and makes the component non-interactive, the second sets
        the value to ``None``.
    """
    frozen_instruction = gr.update(value=instruction, interactive=False)
    cleared_value = gr.update(value=None)
    return frozen_instruction, cleared_value
161
+
162
def reset_demo():
    """Forget the current conversation and return the reset UI values.

    Rebinds the module-level ``chat_log``, ``request_count`` and
    ``now_session_id`` to their initial values.

    Returns:
        A 2-tuple: an empty string, and a ``gr.update`` that sets an empty
        value and makes the component interactive again.
    """
    global chat_log, request_count, now_session_id
    chat_log, request_count, now_session_id = [], 0, ""
    return "", gr.update(value="", interactive=True)
168
+
169
# Usage instructions, written in Markdown and shown via gr.Markdown in the UI.
tos_markdown = ("""### Terms of use
1. In the Instruction field, enter the instruction you want to execute.
2. In the Screenshot field, upload a screenshot of your current mobile device.
3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the current screenshot again.
4. Once the Instruction is entered, it cannot be changed midway. If the instruction has been completed or you think the current operation cannot complete the instruction, click \"Clear\".
5. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom experience.""")
175
+
176
# The instruction box is created before the layout so gr.Examples (left
# column) can reference it; it is placed in the right column via .render().
text_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")

with gr.Blocks() as demo:
    gr.Markdown("# Mobile-Agent")
    with gr.Row():
        # Left column: usage notes, screenshot upload and the example flow.
        with gr.Column(scale=4):
            gr.Markdown(tos_markdown)
            with gr.Row():
                image_input = gr.Image(label="Screenshot", type="pil", height=570, width=300)
                gr.Examples(
                    examples=[
                        ["./example/1.jpg", "Turn on the dark mode"],
                        ["./example/2.jpg", "Turn on the dark mode"],
                        ["./example/3.jpg", "Turn on the dark mode"],
                        ["./example/4.jpg", "Turn on the dark mode"],
                        ["./example/5.jpg", "Turn on the dark mode"],
                    ],
                    inputs=[image_input, text_input],
                )

        # Right column: instruction box, action buttons and the chat history.
        with gr.Column(scale=6):
            text_input.render()
            with gr.Row():
                submit_button = gr.Button("Submit")
                clear_button = gr.Button("Clear")
            output_component = gr.HTML(label="Chat history")

    def _on_submit(image, instruction):
        """Run one chat turn, then lock the instruction and clear the screenshot."""
        return (chatbot(image, instruction),) + lock_input(image, instruction)

    submit_button.click(
        fn=_on_submit,
        inputs=[image_input, text_input],
        outputs=[output_component, text_input, image_input],
    )

    clear_button.click(
        fn=reset_demo,
        inputs=[],
        outputs=[output_component, text_input],
    )

demo.queue().launch(share=False)