阳渠 committed on
Commit
1e96bca
1 Parent(s): 8572674

Mobile-Agent-v2

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. MobileAgent/__pycache__/api.cpython-310.pyc +0 -0
  2. MobileAgent/__pycache__/api_service.cpython-310.pyc +0 -0
  3. MobileAgent/__pycache__/chat.cpython-310.pyc +0 -0
  4. MobileAgent/__pycache__/controller.cpython-310.pyc +0 -0
  5. MobileAgent/__pycache__/crop.cpython-310.pyc +0 -0
  6. MobileAgent/__pycache__/icon_localization.cpython-310.pyc +0 -0
  7. MobileAgent/__pycache__/local_server.cpython-310.pyc +0 -0
  8. MobileAgent/__pycache__/prompt.cpython-310.pyc +0 -0
  9. MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc +0 -0
  10. MobileAgent/__pycache__/text_localization.cpython-310.pyc +0 -0
  11. MobileAgent/api.py +45 -0
  12. MobileAgent/api_service.py +26 -0
  13. MobileAgent/chat.py +86 -0
  14. MobileAgent/crop.py +141 -0
  15. MobileAgent/icon_localization.py +59 -0
  16. MobileAgent/local_server.py +172 -0
  17. MobileAgent/prompt_no_input.py +174 -0
  18. MobileAgent/text_localization.py +58 -0
  19. README.md +5 -4
  20. app.py +465 -0
  21. cache/1.png +0 -0
  22. cache/10.png +0 -0
  23. cache/11.png +0 -0
  24. cache/12.png +0 -0
  25. cache/13.png +0 -0
  26. cache/14.png +0 -0
  27. cache/15.png +0 -0
  28. cache/16.png +0 -0
  29. cache/17.png +0 -0
  30. cache/18.png +0 -0
  31. cache/19.png +0 -0
  32. cache/2.png +0 -0
  33. cache/20.png +0 -0
  34. cache/21.png +0 -0
  35. cache/22.png +0 -0
  36. cache/23.png +0 -0
  37. cache/24.png +0 -0
  38. cache/25.png +0 -0
  39. cache/3.png +0 -0
  40. cache/4.png +0 -0
  41. cache/5.png +0 -0
  42. cache/6.png +0 -0
  43. cache/7.png +0 -0
  44. cache/8.png +0 -0
  45. cache/9.png +0 -0
  46. example/example_1.jpg +0 -0
  47. example/example_2.jpg +0 -0
  48. example/example_3.jpg +0 -0
  49. example/example_4.jpg +0 -0
  50. example/example_5.jpg +0 -0
MobileAgent/__pycache__/api.cpython-310.pyc ADDED
Binary file (1.18 kB)

MobileAgent/__pycache__/api_service.cpython-310.pyc ADDED
Binary file (633 Bytes)

MobileAgent/__pycache__/chat.cpython-310.pyc ADDED
Binary file (1.92 kB)

MobileAgent/__pycache__/controller.cpython-310.pyc ADDED
Binary file (4.05 kB)

MobileAgent/__pycache__/crop.cpython-310.pyc ADDED
Binary file (3.9 kB)

MobileAgent/__pycache__/icon_localization.cpython-310.pyc ADDED
Binary file (1.77 kB)

MobileAgent/__pycache__/local_server.cpython-310.pyc ADDED
Binary file (4.25 kB)

MobileAgent/__pycache__/prompt.cpython-310.pyc ADDED
Binary file (9.8 kB)

MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc ADDED
Binary file (9.04 kB)

MobileAgent/__pycache__/text_localization.cpython-310.pyc ADDED
Binary file (1.98 kB)
MobileAgent/api.py ADDED
@@ -0,0 +1,45 @@
+ import base64
+ import requests
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def inference_chat(chat, model, api_url, token):
+     headers = {
+         "Content-Type": "application/json",
+         "Authorization": f"Bearer {token}"
+     }
+
+     data = {
+         "model": model,
+         "messages": [],
+         "max_tokens": 2048,
+         'temperature': 0.0,
+         "seed": 1234
+     }
+
+     for role, content in chat:
+         data["messages"].append({"role": role, "content": content})
+
+     retry = 3
+     cur_try = 0
+     while True:
+         cur_try += 1
+         if cur_try > retry:
+             return "No token"
+         try:
+             res = requests.post(api_url, headers=headers, json=data)
+             res_json = res.json()
+             res_content = res_json['data']['response']['choices'][0]['message']['content']
+         except:
+             print("Network Error:")
+             try:
+                 print(res.json())
+             except:
+                 print("Request Failed")
+         else:
+             break
+
+     return res_content
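
A minimal usage sketch for these helpers; the endpoint URL and token below are placeholders, and the chat format mirrors the (role, content) pairs built in MobileAgent/chat.py:

from MobileAgent.api import encode_image, inference_chat

API_URL = "https://example.com/v1/chat/completions"  # placeholder endpoint
TOKEN = "your-api-token"                              # placeholder token

# A chat is a list of (role, content) pairs; content is a list of typed parts.
chat = [
    ("system", [{"type": "text", "text": "You are a helpful assistant."}]),
    ("user", [
        {"type": "text", "text": "Describe this screenshot."},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image('screenshot/screenshot_local.png')}"}},
    ]),
]

answer = inference_chat(chat, "gpt-4o", API_URL, TOKEN)  # returns "No token" after 3 failed attempts
print(answer)
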
MobileAgent/api_service.py ADDED
@@ -0,0 +1,26 @@
+ import requests
+ import json
+
+
+ def get_action(query_data, url, token):
+
+     headers = {
+         'Authorization': token,
+         'Content-Type': 'application/json'
+     }
+
+     data = {
+         "model": "pre-Mobile_Agent_Server_ADB_V2-2204",
+         "input": {"json_data": query_data}
+     }
+
+     while True:
+         try:
+             response = requests.post(url, headers=headers, data=json.dumps(data))
+             response.json()["output"]
+         except:
+             print("Network Error:", response.json())
+         else:
+             break
+
+     return response
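
A rough sketch of calling `get_action`; the URL and token are placeholders, and the payload shape follows the task dictionaries that `mobile_agent_infer` consumes, which is an assumption about the remote deployment:

from MobileAgent.api_service import get_action

url = "https://example.com/api/v1/services/aigc/generation"  # placeholder service URL
token = "Bearer your-service-token"                          # placeholder token

query_data = {"task": "planning", "instruction": "Turn on the dark mode",
              "thought_history": [], "summary_history": [], "action_history": [],
              "completed_requirements": "", "add_info": ""}

response = get_action(query_data, url, token)  # retries until the response contains an "output" field
print(response.json()["output"])
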
MobileAgent/chat.py ADDED
@@ -0,0 +1,86 @@
+ import copy
+ from MobileAgent.api import encode_image
+
+
+ def init_action_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI mobile phone operating assistant. You need to help me operate the phone to complete the user\'s instruction."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_reflect_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI mobile phone operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_memory_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI mobile phone operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def add_response(role, prompt, chat_history, image=None):
+     new_chat_history = copy.deepcopy(chat_history)
+     if image:
+         base64_image = encode_image(image)
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+             {
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/jpeg;base64,{base64_image}"
+                 }
+             },
+         ]
+     else:
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+         ]
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def add_response_two_image(role, prompt, chat_history, image):
+     new_chat_history = copy.deepcopy(chat_history)
+
+     base64_image1 = encode_image(image[0])
+     base64_image2 = encode_image(image[1])
+     content = [
+         {
+             "type": "text",
+             "text": prompt
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image1}"
+             }
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image2}"
+             }
+         },
+     ]
+
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def print_status(chat_history):
+     print("*"*100)
+     for chat in chat_history:
+         print("role:", chat[0])
+         print(chat[1][0]["text"] + "<image>"*(len(chat[1])-1) + "\n")
+     print("*"*100)
MobileAgent/crop.py ADDED
@@ -0,0 +1,141 @@
+ import math
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ import clip
+ import torch
+
+
+ def crop_image(img, position):
+     def distance(x1,y1,x2,y2):
+         return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
+     position = position.tolist()
+     for i in range(4):
+         for j in range(i+1, 4):
+             if(position[i][0] > position[j][0]):
+                 tmp = position[j]
+                 position[j] = position[i]
+                 position[i] = tmp
+     if position[0][1] > position[1][1]:
+         tmp = position[0]
+         position[0] = position[1]
+         position[1] = tmp
+
+     if position[2][1] > position[3][1]:
+         tmp = position[2]
+         position[2] = position[3]
+         position[3] = tmp
+
+     x1, y1 = position[0][0], position[0][1]
+     x2, y2 = position[2][0], position[2][1]
+     x3, y3 = position[3][0], position[3][1]
+     x4, y4 = position[1][0], position[1][1]
+
+     corners = np.zeros((4,2), np.float32)
+     corners[0] = [x1, y1]
+     corners[1] = [x2, y2]
+     corners[2] = [x4, y4]
+     corners[3] = [x3, y3]
+
+     img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2)
+     img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2)
+
+     corners_trans = np.zeros((4,2), np.float32)
+     corners_trans[0] = [0, 0]
+     corners_trans[1] = [img_width - 1, 0]
+     corners_trans[2] = [0, img_height - 1]
+     corners_trans[3] = [img_width - 1, img_height - 1]
+
+     transform = cv2.getPerspectiveTransform(corners, corners_trans)
+     dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
+     return dst
+
+
+ def calculate_size(box):
+     return (box[2]-box[0]) * (box[3]-box[1])
+
+
+ def calculate_iou(box1, box2):
+     xA = max(box1[0], box2[0])
+     yA = max(box1[1], box2[1])
+     xB = min(box1[2], box2[2])
+     yB = min(box1[3], box2[3])
+
+     interArea = max(0, xB - xA) * max(0, yB - yA)
+     box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+     unionArea = box1Area + box2Area - interArea
+     iou = interArea / unionArea
+
+     return iou
+
+
+ def crop(image, box, i, text_data=None):
+     image = Image.open(image)
+
+     if text_data:
+         draw = ImageDraw.Draw(image)
+         draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
+         # font_size = int((text_data[3] - text_data[1])*0.75)
+         # font = ImageFont.truetype("arial.ttf", font_size)
+         # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")
+
+     cropped_image = image.crop(box)
+     cropped_image.save(f"./temp/{i}.jpg")
+
+
+ def in_box(box, target):
+     if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]):
+         return True
+     else:
+         return False
+
+
+ def crop_for_clip(image, box, i, position):
+     image = Image.open(image)
+     w, h = image.size
+     if position == "left":
+         bound = [0, 0, w/2, h]
+     elif position == "right":
+         bound = [w/2, 0, w, h]
+     elif position == "top":
+         bound = [0, 0, w, h/2]
+     elif position == "bottom":
+         bound = [0, h/2, w, h]
+     elif position == "top left":
+         bound = [0, 0, w/2, h/2]
+     elif position == "top right":
+         bound = [w/2, 0, w, h/2]
+     elif position == "bottom left":
+         bound = [0, h/2, w/2, h]
+     elif position == "bottom right":
+         bound = [w/2, h/2, w, h]
+     else:
+         bound = [0, 0, w, h]
+
+     if in_box(box, bound):
+         cropped_image = image.crop(box)
+         cropped_image.save(f"./temp/{i}.jpg")
+         return True
+     else:
+         return False
+
+
+ def clip_for_icon(clip_model, clip_preprocess, images, prompt):
+     image_features = []
+     for image_file in images:
+         image = clip_preprocess(Image.open(image_file)).unsqueeze(0).to(next(clip_model.parameters()).device)
+         image_feature = clip_model.encode_image(image)
+         image_features.append(image_feature)
+     image_features = torch.cat(image_features)
+
+     text = clip.tokenize([prompt]).to(next(clip_model.parameters()).device)
+     text_features = clip_model.encode_text(text)
+
+     image_features /= image_features.norm(dim=-1, keepdim=True)
+     text_features /= text_features.norm(dim=-1, keepdim=True)
+     similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0)
+     _, max_pos = torch.max(similarity, dim=0)
+     pos = max_pos.item()
+
+     return pos
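
A usage sketch for `clip_for_icon`, assuming the OpenAI CLIP package that this module imports and a few crops previously saved by `crop_for_clip`:

import clip
import torch
from MobileAgent.crop import clip_for_icon

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

candidate_crops = ["./temp/1.jpg", "./temp/2.jpg", "./temp/3.jpg"]  # crops written by crop_for_clip
best = clip_for_icon(clip_model, clip_preprocess, candidate_crops, "a blue settings gear icon")
print("Best-matching crop:", candidate_crops[best])
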
MobileAgent/icon_localization.py ADDED
@@ -0,0 +1,59 @@
+ from MobileAgent.crop import calculate_size, calculate_iou
+ from modelscope.pipelines import pipeline
+ from PIL import Image
+ import torch
+
+ def remove_boxes(boxes_filt, size, iou_threshold=0.5):
+     boxes_to_remove = set()
+
+     for i in range(len(boxes_filt)):
+         if calculate_size(boxes_filt[i]) > 0.05*size[0]*size[1]:
+             boxes_to_remove.add(i)
+         for j in range(len(boxes_filt)):
+             if calculate_size(boxes_filt[j]) > 0.05*size[0]*size[1]:
+                 boxes_to_remove.add(j)
+             if i == j:
+                 continue
+             if i in boxes_to_remove or j in boxes_to_remove:
+                 continue
+             iou = calculate_iou(boxes_filt[i], boxes_filt[j])
+             if iou >= iou_threshold:
+                 boxes_to_remove.add(j)
+
+     boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove]
+
+     return boxes_filt
+
+
+ def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5):
+     image = Image.open(input_image_path)
+     size = image.size
+
+     caption = caption.lower()
+     caption = caption.strip()
+     if not caption.endswith('.'):
+         caption = caption + '.'
+
+     inputs = {
+         'IMAGE_PATH': input_image_path,
+         'TEXT_PROMPT': caption,
+         'BOX_TRESHOLD': box_threshold,
+         'TEXT_TRESHOLD': text_threshold
+     }
+
+     result = groundingdino_model(inputs)
+     boxes_filt = result['boxes']
+
+     H, W = size[1], size[0]
+     for i in range(boxes_filt.size(0)):
+         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+         boxes_filt[i][2:] += boxes_filt[i][:2]
+
+     boxes_filt = boxes_filt.cpu().int().tolist()
+     filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
+     coordinates = []
+     for box in filtered_boxes:
+         coordinates.append([box[0], box[1], box[2], box[3]])
+
+     return coordinates
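
A sketch of driving `det` with the ModelScope GroundingDINO pipeline that app.py constructs:

from modelscope import snapshot_download
from modelscope.pipelines import pipeline
from MobileAgent.icon_localization import det

groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)

# Returns [x1, y1, x2, y2] boxes for icon-like regions on the screenshot.
boxes = det("screenshot/screenshot_local.png", "icon", groundingdino_model)
print(boxes)
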
MobileAgent/local_server.py ADDED
@@ -0,0 +1,172 @@
+ import os
+ import base64
+ from io import BytesIO
+ from PIL import Image
+
+ from MobileAgent.api import inference_chat
+ from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt
+ from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image
+
+ from dashscope import MultiModalConversation
+ import dashscope
+ import concurrent
+
+
+ API_url = os.environ.get('url')
+ token = os.environ.get('token')
+
+
+ def base64_to_pil(base64_string):
+     if base64_string.startswith('data:image'):
+         base64_string = base64_string.split(',')[-1]
+     image_data = base64.b64decode(base64_string)
+     image_stream = BytesIO(image_data)
+     pil_image = Image.open(image_stream)
+     return pil_image
+
+
+ def process_image(image, query):
+     dashscope.api_key = os.environ.get('qwen')
+     image = "file://" + image
+     messages = [{
+         'role': 'user',
+         'content': [
+             {
+                 'image': image
+             },
+             {
+                 'text': query
+             },
+         ]
+     }]
+     response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages)
+
+     try:
+         response = response['output']['choices'][0]['message']['content'][0]["text"]
+     except:
+         response = "This is an icon."
+
+     return response
+
+
+ if not os.path.exists("screenshot"):
+     os.mkdir("screenshot")
+ if not os.path.exists("temp"):
+     os.mkdir("temp")
+
+
+ def mobile_agent_infer(json_data):
+     task = json_data["task"]
+     if task == "caption":
+         query = json_data["query"]
+         images = json_data["images"]
+         local_images = []
+         for image in images:
+             image_name = image["image_name"]
+             image_file = image["image_file"]
+             image_file = base64_to_pil(image_file)
+             image_path = "temp/" + image_name
+             image_file.save(image_path, "PNG")
+             local_images.append(image_path)
+
+         icon_map = {}
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = {executor.submit(process_image, image, query): i for i, image in enumerate(local_images)}
+
+             for future in concurrent.futures.as_completed(futures):
+                 i = futures[future]
+                 response = future.result()
+                 icon_map[i + 1] = response
+
+         output = {"icon_map": icon_map}
+         return output
+
+     elif task == "planning":
+         instruction = json_data["instruction"]
+         thought_history = json_data["thought_history"]
+         summary_history = json_data["summary_history"]
+         action_history = json_data["action_history"]
+         completed_requirements = json_data["completed_requirements"]
+         add_info = json_data["add_info"]
+
+         prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history,
+                                              completed_requirements, add_info)
+         chat_planning = init_memory_chat()
+         chat_planning = add_response("user", prompt_planning, chat_planning)
+         output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token)
+
+         output = {"planning": output_planning}
+         return output
+
+     elif task == "decision":
+         screenshot_file = json_data["screenshot_file"]
+         screenshot_file = base64_to_pil(screenshot_file)
+         image_path = "screenshot/screenshot_local.png"
+         screenshot_file.save(image_path, "PNG")
+
+         instruction = json_data["instruction"]
+         perception_infos = json_data["perception_infos"]
+         width = json_data["width"]
+         height = json_data["height"]
+         summary_history = json_data["summary_history"]
+         action_history = json_data["action_history"]
+         summary = json_data["summary"]
+         action = json_data["action"]
+         add_info = json_data["add_info"]
+         error_flag = json_data["error_flag"]
+         completed_requirements = json_data["completed_requirements"]
+         memory = json_data["memory"]
+         memory_switch = json_data["memory_switch"]
+         insight = json_data["insight"]
+
+         prompt_action = get_action_prompt(instruction, perception_infos, width, height, summary_history,
+                                           action_history, summary, action, add_info, error_flag, completed_requirements,
+                                           memory)
+         chat_action = init_action_chat()
+         chat_action = add_response("user", prompt_action, chat_action, image_path)
+         output_action = inference_chat(chat_action, 'gpt-4o', API_url, token)
+         if output_action == "No token":
+             output = {"decision": "No token", "memory": None}
+             return output
+         chat_action = add_response("assistant", output_action, chat_action)
+
+         output_memory = None
+         if memory_switch:
+             prompt_memory = get_memory_prompt(insight)
+             chat_action = add_response("user", prompt_memory, chat_action)
+             output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token)
+
+         output = {"decision": output_action, "memory": output_memory}
+         return output
+
+     elif task == "reflection":
+         screenshot_file = json_data["screenshot_file"]
+         screenshot_file = base64_to_pil(screenshot_file)
+         image_path = "screenshot/screenshot_local.png"
+         screenshot_file.save(image_path, "PNG")
+         last_screenshot_file = json_data["last_screenshot_file"]
+         last_screenshot_file = base64_to_pil(last_screenshot_file)
+         last_image_path = "screenshot/last_screenshot_local.png"
+         last_screenshot_file.save(last_image_path, "PNG")
+
+         instruction = json_data["instruction"]
+         last_perception_infos = json_data["last_perception_infos"]
+         perception_infos = json_data["perception_infos"]
+         width = json_data["width"]
+         height = json_data["height"]
+         summary = json_data["summary"]
+         action = json_data["action"]
+         add_info = json_data["add_info"]
+
+         prompt_reflect = get_reflect_prompt(instruction, last_perception_infos, perception_infos, width, height,
+                                             summary, action, add_info)
+         chat_reflect = init_reflect_chat()
+         chat_reflect = add_response_two_image("user", prompt_reflect, chat_reflect, [last_image_path, image_path])
+         output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token)
+
+         output = {"reflection": output_reflect}
+         return output
+
+     else:
+         output = {"error": "The task must be in \"caption\", \"planning\", \"decision\" and \"reflection\"."}
+         return output
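
An illustrative call to `mobile_agent_infer` for the "planning" task; the field values are made up, and the `url`, `token` and `qwen` environment variables need to be set before this module is imported:

from MobileAgent.local_server import mobile_agent_infer

query_data = {
    "task": "planning",
    "instruction": "Turn on the dark mode",
    "thought_history": ["Open the Settings app"],
    "summary_history": ["Open Settings to find the display options"],
    "action_history": ["Open app (Settings)"],
    "completed_requirements": "",
    "add_info": "",
}

result = mobile_agent_infer(query_data)
print(result["planning"])  # the model's "### Completed contents ###" summary
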
MobileAgent/prompt_no_input.py ADDED
@@ -0,0 +1,174 @@
+ def get_action_prompt(instruction, clickable_infos, width, height, summary_history, action_history, last_summary, last_action, add_info, error_flag, completed_content, memory):
+     prompt = "### Background ###\n"
+     prompt += f"This image is a phone screenshot. Its width is {width} pixels and its height is {height} pixels. The user\'s instruction is: {instruction}.\n\n"
+
+     prompt += "### Screenshot information ###\n"
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. "
+     prompt += "This information consists of two parts: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. "
+     prompt += "The information is as follow:\n"
+
+     for clickable_info in clickable_infos:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+
+     prompt += "Please note that this information is not necessarily accurate. You need to combine the screenshot to understand."
+     prompt += "\n\n"
+
+     if add_info != "":
+         prompt += "### Hint ###\n"
+         prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n"
+         prompt += add_info
+         prompt += "\n\n"
+
+     if len(action_history) > 0:
+         prompt += "### History operations ###\n"
+         prompt += "Before reaching this page, some operations have been completed. You need to refer to the completed operations to decide the next operation. These operations are as follow:\n"
+         for i in range(len(action_history)):
+             prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(" to ")[0].strip() + "; Action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+     if completed_content != "":
+         prompt += "### Progress ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+     if memory != "":
+         prompt += "### Memory ###\n"
+         prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
+         prompt += "Memory:\n" + memory + "\n"
+
+     if error_flag:
+         prompt += "### Last operation ###\n"
+         prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time."
+         prompt += "\n\n"
+
+     prompt += "### Response requirements ###\n"
+     prompt += "Now you need to combine all of the above to perform just one action on the current page. You must choose one of the six actions below:\n"
+     prompt += "Open app (app name): If the current page is desktop, you can use this action to open the app named \"app name\" on the desktop.\n"
+     prompt += "Tap (x, y): Tap the position (x, y) in current page.\n"
+     prompt += "Swipe (x1, y1), (x2, y2): Swipe from position (x1, y1) to position (x2, y2).\n"
+     prompt += "Type (text): Type the \"text\" in the input box.\n"
+     prompt += "Home: Return to home page.\n"
+     prompt += "Stop: If you think all the requirements of user\'s instruction have been completed and no further operation is required, you can choose this action to terminate the operation process."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output consists of the following three parts:\n"
+     prompt += "### Thought ###\nThink about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation.\n"
+     prompt += "### Action ###\nYou can only choose one from the six actions above. Make sure that the coordinates or text in the \"()\".\n"
+     prompt += "### Operation ###\nPlease generate a brief natural language description for the operation in Action based on your Thought."
+
+     return prompt
+
+
+ def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info):
+     prompt = f"These images are two phone screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n"
+
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. "
+     prompt += "The information consists of two parts, consisting of format: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively "
+     prompt += "The keyboard status is whether the keyboard of the current page is activated."
+     prompt += "\n\n"
+
+     prompt += "### Before the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos1:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n"
+
+     prompt += "### After the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos2:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n"
+
+     prompt += "### Current operation ###\n"
+     prompt += f"The user\'s instruction is: {instruction}. You also need to note the following requirements: {add_info}. In the process of completing the requirements of instruction, an operation is performed on the phone. Below are the details of this operation:\n"
+     prompt += "Operation thought: " + summary.split(" to ")[0].strip() + "\n"
+     prompt += "Operation action: " + action
+     prompt += "\n\n"
+
+     prompt += "### Response requirements ###\n"
+     prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n"
+     prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
+     prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n"
+     prompt += "B: The \"Operation action\" results in a wrong page and I need to return to the previous page.\n"
+     prompt += "C: The \"Operation action\" produces no changes."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Thought ###\nYour thought about the question\n"
+     prompt += "### Answer ###\nA or B or C"
+
+     return prompt
+
+
+ def get_memory_prompt(insight):
+     if insight != "":
+         prompt = "### Important content ###\n"
+         prompt += insight
+         prompt += "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     else:
+         prompt = "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to user\'s instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###."
+
+     return prompt
+
+ def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info):
+     prompt = "### Background ###\n"
+     prompt += f"There is an user\'s instruction which is: {instruction}. You are a mobile phone operating assistant and are operating the user\'s mobile phone.\n\n"
+
+     if add_info != "":
+         prompt += "### Hint ###\n"
+         prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n"
+         prompt += add_info
+         prompt += "\n\n"
+
+     if len(thought_history) > 1:
+         prompt += "### History operations ###\n"
+         prompt += "To complete the requirements of user\'s instruction, you have performed a series of operations. These operations are as follow:\n"
+         for i in range(len(summary_history)):
+             operation = summary_history[i].split(" to ")[0].strip()
+             prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+         prompt += "### Progress thinking ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."
+
+     else:
+         prompt += "### Current operation ###\n"
+         prompt += "To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n"
+         prompt += f"Operation thought: {thought_history[-1]}\n"
+         operation = summary_history[-1].split(" to ")[0].strip()
+         prompt += f"Operation action: {operation}\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
+         prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
+         prompt += "(Please use English to output)"
+
+     return prompt
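
A small sketch of `get_action_prompt` with perception entries shaped like those produced in app.py (the coordinate and text values are illustrative):

from MobileAgent.prompt_no_input import get_action_prompt

perception_infos = [
    {"text": "text: Settings", "coordinates": [540, 1210]},
    {"text": "icon: a grey gear icon", "coordinates": [540, 1080]},
]

prompt = get_action_prompt(
    instruction="Turn on the dark mode",
    clickable_infos=perception_infos,
    width=1080, height=2340,
    summary_history=[], action_history=[],
    last_summary="", last_action="",
    add_info="If you want to tap an icon of an app, use the action \"Open app\"",
    error_flag=False, completed_content="", memory="",
)
print(prompt)
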
MobileAgent/text_localization.py ADDED
@@ -0,0 +1,58 @@
+ import cv2
+ import numpy as np
+ from MobileAgent.crop import crop_image, calculate_size
+ from PIL import Image
+
+
+ def order_point(coor):
+     arr = np.array(coor).reshape([4, 2])
+     sum_ = np.sum(arr, 0)
+     centroid = sum_ / arr.shape[0]
+     theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0])
+     sort_points = arr[np.argsort(theta)]
+     sort_points = sort_points.reshape([4, -1])
+     if sort_points[0][0] > centroid[0]:
+         sort_points = np.concatenate([sort_points[3:], sort_points[:3]])
+     sort_points = sort_points.reshape([4, 2]).astype('float32')
+     return sort_points
+
+
+ def longest_common_substring_length(str1, str2):
+     m = len(str1)
+     n = len(str2)
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+     for i in range(1, m + 1):
+         for j in range(1, n + 1):
+             if str1[i - 1] == str2[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1] + 1
+             else:
+                 dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+     return dp[m][n]
+
+
+ def ocr(image_path, ocr_detection, ocr_recognition):
+     text_data = []
+     coordinate = []
+
+     image_full = cv2.imread(image_path)
+     det_result = ocr_detection(image_full)
+     det_result = det_result['polygons']
+     for i in range(det_result.shape[0]):
+         pts = order_point(det_result[i])
+         image_crop = crop_image(image_full, pts)
+
+         try:
+             result = ocr_recognition(image_crop)['text'][0]
+         except:
+             continue
+
+         box = [int(e) for e in list(pts.reshape(-1))]
+         box = [box[0], box[1], box[4], box[5]]
+
+         text_data.append(result)
+         coordinate.append(box)
+
+     else:
+         return text_data, coordinate
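
A sketch of running `ocr` with the same ModelScope pipelines that app.py instantiates:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from MobileAgent.text_localization import ocr

ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')

texts, boxes = ocr("screenshot/screenshot_local.png", ocr_detection, ocr_recognition)
for t, b in zip(texts, boxes):
    print(b, t)  # [x1, y1, x2, y2] box followed by the recognized text
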
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
  title: Mobile Agent
- emoji: 😻
+ emoji: 🦀
- colorFrom: blue
+ colorFrom: indigo
- colorTo: red
+ colorTo: green
  sdk: gradio
- sdk_version: 4.37.1
+ sdk_version: 4.19.1
  app_file: app.py
  pinned: false
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,465 @@
+ import io
+ import os
+ import shutil
+ import base64
+ import gradio as gr
+ from PIL import Image, ImageDraw
+
+ from MobileAgent.text_localization import ocr
+ from MobileAgent.icon_localization import det
+ from MobileAgent.local_server import mobile_agent_infer
+
+ from modelscope import snapshot_download
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+
+
+ chatbot_css = """
+ <style>
+     .chat-container {
+         display: flex;
+         flex-direction: column;
+         overflow-y: auto;
+         max-height: 630px;
+         margin: 10px;
+     }
+     .user-message, .bot-message {
+         margin: 5px;
+         padding: 10px;
+         border-radius: 10px;
+     }
+     .user-message {
+         text-align: right;
+         background-color: #7B68EE;
+         color: white;
+         align-self: flex-end;
+     }
+     .bot-message {
+         text-align: left;
+         background-color: #ADD8E6;
+         color: black;
+         align-self: flex-start;
+     }
+     .user-image {
+         text-align: right;
+         align-self: flex-end;
+         max-width: 150px;
+         max-height: 300px;
+     }
+     .bot-image {
+         text-align: left;
+         align-self: flex-start;
+         max-width: 200px;
+         max-height: 400px;
+     }
+ </style>
+ """
+
+
+ temp_file = "temp"
+ screenshot = "screenshot"
+ cache = "cache"
+ if not os.path.exists(temp_file):
+     os.mkdir(temp_file)
+ if not os.path.exists(screenshot):
+     os.mkdir(screenshot)
+ if not os.path.exists(cache):
+     os.mkdir(cache)
+
+
+ groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
+ groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)
+ ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
+ ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')
+
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def get_all_files_in_folder(folder_path):
+     file_list = []
+     for file_name in os.listdir(folder_path):
+         file_list.append(file_name)
+     return file_list
+
+
+ def crop(image, box, i):
+     image = Image.open(image)
+     x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
+     if x1 >= x2-10 or y1 >= y2-10:
+         return
+     cropped_image = image.crop((x1, y1, x2, y2))
+     cropped_image.save(f"./temp/{i}.png", format="PNG")
+
+
+ def merge_text_blocks(text_list, coordinates_list):
+     merged_text_blocks = []
+     merged_coordinates = []
+
+     sorted_indices = sorted(range(len(coordinates_list)), key=lambda k: (coordinates_list[k][1], coordinates_list[k][0]))
+     sorted_text_list = [text_list[i] for i in sorted_indices]
+     sorted_coordinates_list = [coordinates_list[i] for i in sorted_indices]
+
+     num_blocks = len(sorted_text_list)
+     merge = [False] * num_blocks
+
+     for i in range(num_blocks):
+         if merge[i]:
+             continue
+
+         anchor = i
+
+         group_text = [sorted_text_list[anchor]]
+         group_coordinates = [sorted_coordinates_list[anchor]]
+
+         for j in range(i+1, num_blocks):
+             if merge[j]:
+                 continue
+
+             if abs(sorted_coordinates_list[anchor][0] - sorted_coordinates_list[j][0]) < 10 and \
+             sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] >= -10 and sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] < 30 and \
+             abs(sorted_coordinates_list[anchor][3] - sorted_coordinates_list[anchor][1] - (sorted_coordinates_list[j][3] - sorted_coordinates_list[j][1])) < 10:
+                 group_text.append(sorted_text_list[j])
+                 group_coordinates.append(sorted_coordinates_list[j])
+                 merge[anchor] = True
+                 anchor = j
+                 merge[anchor] = True
+
+         merged_text = "\n".join(group_text)
+         min_x1 = min(group_coordinates, key=lambda x: x[0])[0]
+         min_y1 = min(group_coordinates, key=lambda x: x[1])[1]
+         max_x2 = max(group_coordinates, key=lambda x: x[2])[2]
+         max_y2 = max(group_coordinates, key=lambda x: x[3])[3]
+
+         merged_text_blocks.append(merged_text)
+         merged_coordinates.append([min_x1, min_y1, max_x2, max_y2])
+
+     return merged_text_blocks, merged_coordinates
+
+
+ def get_perception_infos(screenshot_file):
+     width, height = Image.open(screenshot_file).size
+
+     text, coordinates = ocr(screenshot_file, ocr_detection, ocr_recognition)
+     text, coordinates = merge_text_blocks(text, coordinates)
+
+     perception_infos = []
+     for i in range(len(coordinates)):
+         perception_info = {"text": "text: " + text[i], "coordinates": coordinates[i]}
+         perception_infos.append(perception_info)
+
+     coordinates = det(screenshot_file, "icon", groundingdino_model)
+
+     for i in range(len(coordinates)):
+         perception_info = {"text": "icon", "coordinates": coordinates[i]}
+         perception_infos.append(perception_info)
+
+     image_box = []
+     image_id = []
+     for i in range(len(perception_infos)):
+         if perception_infos[i]['text'] == 'icon':
+             image_box.append(perception_infos[i]['coordinates'])
+             image_id.append(i)
+
+     for i in range(len(image_box)):
+         crop(screenshot_file, image_box[i], image_id[i])
+
+     images = get_all_files_in_folder(temp_file)
+     if len(images) > 0:
+         images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
+         image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
+         icon_map = {}
+         prompt = 'This image is an icon from a phone screen. Please briefly describe the shape and color of this icon in one sentence.'
+
+         string_image = []
+         for i in range(len(images)):
+             image_path = os.path.join(temp_file, images[i])
+             string_image.append({"image_name": images[i], "image_file": encode_image(image_path)})
+         query_data = {"task": "caption", "images": string_image, "query": prompt}
+         response_query = mobile_agent_infer(query_data)
+         icon_map = response_query["icon_map"]
+
+         for i, j in zip(image_id, range(1, len(image_id)+1)):
+             if icon_map.get(str(j)):
+                 perception_infos[i]['text'] = "icon: " + icon_map[str(j)]
+
+     for i in range(len(perception_infos)):
+         perception_infos[i]['coordinates'] = [int((perception_infos[i]['coordinates'][0]+perception_infos[i]['coordinates'][2])/2), int((perception_infos[i]['coordinates'][1]+perception_infos[i]['coordinates'][3])/2)]
+
+     return perception_infos, width, height
+
+
+ def image_to_base64(image):
+     buffered = io.BytesIO()
+     image.save(buffered, format="PNG")
+     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+     img_html = f'<img src="data:image/png;base64,{img_str}" />'
+     return img_html
+
+
+ def chatbot(image, instruction, add_info, history, chat_log):
+     if history == {}:
+         thought_history = []
+         summary_history = []
+         action_history = []
+         summary = ""
+         action = ""
+         completed_requirements = ""
+         memory = ""
+         insight = ""
+         error_flag = False
+         user_msg = "<div class='user-message'>{}</div>".format(instruction)
+     else:
+         thought_history = history["thought_history"]
+         summary_history = history["summary_history"]
+         action_history = history["action_history"]
+         summary = history["summary"]
+         action = history["action"]
+         completed_requirements = history["completed_requirements"]
+         memory = history["memory"][0]
+         insight = history["insight"]
+         error_flag = history["error_flag"]
+         user_msg = "<div class='user-message'>{}</div>".format("I have uploaded the screenshot. Please continue operating.")
+
+     images = get_all_files_in_folder(cache)
+     if len(images) > 0 and len(images) <= 100:
+         images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
+         image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
+         cur_image_id = image_id[-1] + 1
+     elif len(images) > 100:
+         images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
+         image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
+         cur_image_id = image_id[-1] + 1
+         os.remove(os.path.join(cache, str(image_id[0])+".png"))
+     else:
+         cur_image_id = 1
+
+     image.save(os.path.join(cache, str(cur_image_id) + ".png"), format="PNG")
+     screenshot_file = os.path.join(cache, str(cur_image_id) + ".png")
+     perception_infos, width, height = get_perception_infos(screenshot_file)
+     shutil.rmtree(temp_file)
+     os.mkdir(temp_file)
+
+     local_screenshot_file = encode_image(screenshot_file)
+     query_data = {
+         "task": "decision",
+         "screenshot_file": local_screenshot_file,
+         "instruction": instruction,
+         "perception_infos": perception_infos,
+         "width": width,
+         "height": height,
+         "summary_history": summary_history,
+         "action_history": action_history,
+         "summary": summary,
+         "action": action,
+         "add_info": add_info,
+         "error_flag": error_flag,
+         "completed_requirements": completed_requirements,
+         "memory": memory,
+         "memory_switch": True,
+         "insight": insight
+     }
+
+     response_query = mobile_agent_infer(query_data)
+     output_action = response_query["decision"]
+     output_memory = response_query["memory"]
+     if output_action == "No token":
+         bot_response = ["<div class='bot-message'>{}</div>".format("Sorry, the resources can be exhausted today.")]
+         chat_html = "<div class='chat-container'>{}</div>".format("".join(bot_response))
+         return chatbot_css + chat_html, history, chat_log
+
+     thought = output_action.split("### Thought ###")[-1].split("### Action ###")[0].replace("\n", " ").replace(":", "").replace("  ", " ").strip()
+     summary = output_action.split("### Operation ###")[-1].replace("\n", " ").replace("  ", " ").strip()
+     action = output_action.split("### Action ###")[-1].split("### Operation ###")[0].replace("\n", " ").replace("  ", " ").strip()
+
+     output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n"
+     if "None" not in output_memory and output_memory not in memory:
+         memory += output_memory
+
+     if "Open app" in action:
+         bot_response = "Please click the red circle and upload the current screenshot again."
+         app_name = action.split("(")[-1].split(")")[0]
+         text, coordinate = ocr(screenshot_file, ocr_detection, ocr_recognition)
+         for ti in range(len(text)):
+             if app_name == text[ti]:
+                 name_coordinate = [int((coordinate[ti][0] + coordinate[ti][2])/2), int((coordinate[ti][1] + coordinate[ti][3])/2)]
+                 x, y = name_coordinate[0], name_coordinate[1]
+                 radius = 75
+                 draw = ImageDraw.Draw(image)
+                 draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10)
+                 break
+
+     elif "Tap" in action:
+         bot_response = "Please click the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         radius = 75
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10)
+
+     elif "Swipe" in action:
+         bot_response = "Please slide from red circle to blue circle and upload the current screenshot again."
+         coordinate1 = action.split("Swipe (")[-1].split("), (")[0].split(", ")
+         coordinate2 = action.split("), (")[-1].split(")")[0].split(", ")
+         x1, y1 = int(coordinate1[0]), int(coordinate1[1])
+         x2, y2 = int(coordinate2[0]), int(coordinate2[1])
+         radius = 75
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x1 - radius, y1 - radius, x1 + radius, y1 + radius], outline='red', width=10)
+         draw.ellipse([x2 - radius, y2 - radius, x2 + radius, y2 + radius], outline='blue', width=10)
+
+     elif "Type" in action:
+         if "(text)" not in action:
+             text = action.split("(")[-1].split(")")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         bot_response = f"Please type the \"{text}\" and upload the current screenshot again."
+
+     elif "Back" in action:
+         bot_response = f"Please back to previous page and upload the current screenshot again."
+
+     elif "Home" in action:
+         bot_response = f"Please back to home page and upload the current screenshot again."
+
+     elif "Stop" in action:
+         bot_response = f"Task completed."
+
+     bot_text1 = "<div class='bot-message'>{}</div>".format("### Decision ###")
+     bot_thought = "<div class='bot-message'>{}</div>".format("Thought: " + thought)
+     bot_action = "<div class='bot-message'>{}</div>".format("Action: " + action)
+     bot_operation = "<div class='bot-message'>{}</div>".format("Operation: " + summary)
+     bot_text2 = "<div class='bot-message'>{}</div>".format("### Memory ###")
+     bot_memory = "<div class='bot-message'>{}</div>".format(output_memory)
+     bot_response = "<div class='bot-message'>{}</div>".format(bot_response)
+     if image is not None:
+         bot_img_html = image_to_base64(image)
+         bot_response = "<div class='bot-image'>{}</div>".format(bot_img_html) + bot_response
+
+     chat_log.append(user_msg)
+
+     thought_history.append(thought)
+     summary_history.append(summary)
+     action_history.append(action)
+
+     history["thought_history"] = thought_history
+     history["summary_history"] = summary_history
+     history["action_history"] = action_history
+     history["summary"] = summary
+     history["action"] = action
+     history["memory"] = memory,
+     history["memory_switch"] = True,
+     history["insight"] = insight
+     history["error_flag"] = error_flag
+
+     query_data = {
+         "task": "planning",
+         "instruction": instruction,
+         "thought_history": thought_history,
+         "summary_history": summary_history,
+         "action_history": action_history,
+         "completed_requirements": "",
+         "add_info": add_info
+     }
+
+     response_query = mobile_agent_infer(query_data)
+     output_planning = response_query["planning"]
+     if output_planning == "No token":
+         bot_response = ["<div class='bot-message'>{}</div>".format("Sorry, the resources can be exhausted today.")]
+         chat_html = "<div class='chat-container'>{}</div>".format("".join(bot_response))
+         return chatbot_css + chat_html, history, chat_log
+
+     output_planning = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip()
+     history["completed_requirements"] = output_planning
+
+     bot_text3 = "<div class='bot-message'>{}</div>".format("### Planning ###")
+     output_planning = "<div class='bot-message'>{}</div>".format(output_planning)
+
+     chat_log.append(bot_text3)
+     chat_log.append(output_planning)
+     chat_log.append(bot_text1)
+     chat_log.append(bot_thought)
+     chat_log.append(bot_action)
+     chat_log.append(bot_operation)
+     chat_log.append(bot_text2)
+     chat_log.append(bot_memory)
+     chat_log.append(bot_response)
+
+     chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
+
+     return chatbot_css + chat_html, history, chat_log
+
+
+ def lock_input(instruction):
+     return gr.update(value=instruction, interactive=False), gr.update(value=None)
+
+
+ def reset_demo():
+     return gr.update(value="", interactive=True), gr.update(value="If you want to tap an icon of an app, use the action \"Open app\"", interactive=True), "<div class='chat-container'></div>", {}, []
+
+
+ tos_markdown = ("""<div style="display:flex; gap: 0.25rem;" align="center">
+ <a href='https://github.com/X-PLUG/MobileAgent'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
+ <a href="https://arxiv.org/abs/2406.01014"><img src="https://img.shields.io/badge/Arxiv-2406.01014-red"></a>
+ <a href='https://github.com/X-PLUG/MobileAgent/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/MobileAgent.svg?style=social'></a>
+ </div>
+ If you like our project, please give us a star ✨ on Github for latest update.
+
+ **Terms of use**
+ 1. Input your instruction in \"Instruction\", for example \"Turn on the dark mode\".
+ 2. You can input helpful operation knowledge in \"Knowledge\".
+ 3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the screenshot after your operation.
+ 4. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom to experience.
+ 5. Due to limited resources, each operation may take a long time, please be patient and wait.
+
+ **使用说明**
+ 1. 在“Instruction”中输入你的指令,例如“打开深色模式”。
+ 2. 你可以在“Knowledge”中输入帮助性的操作知识。
+ 3. 点击“Submit”来获得操作。你需要根据输出来操作手机,并且上传操作后的截图。
+ 4. “Example”中的5个例子是一个任务。从上到下点击它们并且点击“Submit”来体验。
+ 5. 由于资源有限,每次操作的时间会比较长,请耐心等待。""")
+
+ title_markdowm = ("""# Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration""")
+
+ instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
+ knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge", value="If you want to tap an icon of an app, use the action \"Open app\"")
+ with gr.Blocks() as demo:
+     history_state = gr.State(value={})
+     history_output = gr.State(value=[])
+     with gr.Row():
+         gr.Markdown(title_markdowm)
+     with gr.Row():
+         with gr.Column(scale=5):
+             gr.Markdown(tos_markdown)
+             with gr.Row():
+                 image_input = gr.Image(label="Screenshot", type="pil", height=550, width=230)
+                 gr.Examples(examples=[
+                     ["./example/example_1.jpg", "Turn on the dark mode"],
+                     ["./example/example_2.jpg", "Turn on the dark mode"],
+                     ["./example/example_3.jpg", "Turn on the dark mode"],
+                     ["./example/example_4.jpg", "Turn on the dark mode"],
+                     ["./example/example_5.jpg", "Turn on the dark mode"],
+                 ], inputs=[image_input, instruction_input, knowledge_input])
+
+         with gr.Column(scale=6):
+             instruction_input.render()
+             knowledge_input.render()
+             with gr.Row():
+                 start_button = gr.Button("Submit")
+                 clear_button = gr.Button("Clear")
+             output_component = gr.HTML(label="Chat history", value="<div class='chat-container'></div>")
+
+     start_button.click(
+         fn=lambda image, instruction, add_info, history, output: chatbot(image, instruction, add_info, history, output),
+         inputs=[image_input, instruction_input, knowledge_input, history_state, history_output],
+         outputs=[output_component, history_state, history_output]
+     )
+
+     clear_button.click(
+         fn=reset_demo,
+         inputs=[],
+         outputs=[instruction_input, knowledge_input, output_component, history_state, history_output]
+     )
+
+ demo.queue().launch(share=True)
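
A launch sketch: MobileAgent/local_server.py reads the `url`, `token` and `qwen` environment variables at import time, so they must be set (placeholder values shown) before app.py runs:

import os

os.environ.setdefault("url", "https://example.com/v1/chat/completions")  # placeholder GPT-4o-compatible endpoint
os.environ.setdefault("token", "your-api-token")                         # placeholder bearer token
os.environ.setdefault("qwen", "your-dashscope-api-key")                  # placeholder DashScope key for qwen-vl-plus captions

import runpy
runpy.run_path("app.py", run_name="__main__")  # downloads the OCR/GroundingDINO models, then launches the Gradio demo
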
cache/1.png ADDED
cache/10.png ADDED
cache/11.png ADDED
cache/12.png ADDED
cache/13.png ADDED
cache/14.png ADDED
cache/15.png ADDED
cache/16.png ADDED
cache/17.png ADDED
cache/18.png ADDED
cache/19.png ADDED
cache/2.png ADDED
cache/20.png ADDED
cache/21.png ADDED
cache/22.png ADDED
cache/23.png ADDED
cache/24.png ADDED
cache/25.png ADDED
cache/3.png ADDED
cache/4.png ADDED
cache/5.png ADDED
cache/6.png ADDED
cache/7.png ADDED
cache/8.png ADDED
cache/9.png ADDED
example/example_1.jpg ADDED
example/example_2.jpg ADDED
example/example_3.jpg ADDED
example/example_4.jpg ADDED
example/example_5.jpg ADDED