diff --git a/MobileAgent/__pycache__/api.cpython-310.pyc b/MobileAgent/__pycache__/api.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91a1c8e7627a511907fc53c64489c77dbde1a6b5 Binary files /dev/null and b/MobileAgent/__pycache__/api.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/api_service.cpython-310.pyc b/MobileAgent/__pycache__/api_service.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a87cf688237e95e8f5e9f548272991d16543fb2c Binary files /dev/null and b/MobileAgent/__pycache__/api_service.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/chat.cpython-310.pyc b/MobileAgent/__pycache__/chat.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1865cb10903b43e728d8e5d3750f2cbf1a0f9c77 Binary files /dev/null and b/MobileAgent/__pycache__/chat.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/controller.cpython-310.pyc b/MobileAgent/__pycache__/controller.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15fa9b6d4e7e0fa3feacca28930f36f1a06bd9de Binary files /dev/null and b/MobileAgent/__pycache__/controller.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/crop.cpython-310.pyc b/MobileAgent/__pycache__/crop.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d08edef8942a5a3a52c98c60be650c4bedc9b8f Binary files /dev/null and b/MobileAgent/__pycache__/crop.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/icon_localization.cpython-310.pyc b/MobileAgent/__pycache__/icon_localization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aec6c1465a2f53e5c2371d2cbb014f14e49361dc Binary files /dev/null and b/MobileAgent/__pycache__/icon_localization.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/local_server.cpython-310.pyc b/MobileAgent/__pycache__/local_server.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18be8ce6f1a4f8005b36317cac3d1c8dde913e1d Binary files /dev/null and b/MobileAgent/__pycache__/local_server.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/prompt.cpython-310.pyc b/MobileAgent/__pycache__/prompt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f762c1c53a9153ac04d11c9f7c7f7a64dbb93786 Binary files /dev/null and b/MobileAgent/__pycache__/prompt.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc b/MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec4be947ebd415782c06ac257d87495525842a68 Binary files /dev/null and b/MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc differ diff --git a/MobileAgent/__pycache__/text_localization.cpython-310.pyc b/MobileAgent/__pycache__/text_localization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46b23faa35afcf87af5ae8bd4750b620a0273d79 Binary files /dev/null and b/MobileAgent/__pycache__/text_localization.cpython-310.pyc differ diff --git a/MobileAgent/api.py b/MobileAgent/api.py new file mode 100644 index 0000000000000000000000000000000000000000..72f0871681f929c220e6a80d9fefceabf82730e7 --- /dev/null +++ b/MobileAgent/api.py @@ -0,0 +1,45 @@ +import base64 +import requests + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + +def 
inference_chat(chat, model, api_url, token): + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {token}" + } + + data = { + "model": model, + "messages": [], + "max_tokens": 2048, + 'temperature': 0.0, + "seed": 1234 + } + + for role, content in chat: + data["messages"].append({"role": role, "content": content}) + + retry = 3 + cur_try = 0 + while True: + cur_try += 1 + if cur_try > retry: + return "No token" + try: + res = requests.post(api_url, headers=headers, json=data) + res_json = res.json() + res_content = res_json['data']['response']['choices'][0]['message']['content'] + except: + print("Network Error:") + try: + print(res.json()) + except: + print("Request Failed") + else: + break + + return res_content diff --git a/MobileAgent/api_service.py b/MobileAgent/api_service.py new file mode 100644 index 0000000000000000000000000000000000000000..5cc65d2fd82fb4aac061cf738ff484d6d9de33a2 --- /dev/null +++ b/MobileAgent/api_service.py @@ -0,0 +1,26 @@ +import requests +import json + + +def get_action(query_data, url, token): + + headers = { + 'Authorization': token, + 'Content-Type': 'application/json' + } + + data = { + "model": "pre-Mobile_Agent_Server_ADB_V2-2204", + "input": {"json_data": query_data} + } + + while True: + try: + response = requests.post(url, headers=headers, data=json.dumps(data)) + response.json()["output"] + except: + print("Network Error:", response.json()) + else: + break + + return response \ No newline at end of file diff --git a/MobileAgent/chat.py b/MobileAgent/chat.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b1f2a622017f36b921ef4c08ab6d058112f348 --- /dev/null +++ b/MobileAgent/chat.py @@ -0,0 +1,86 @@ +import copy +from MobileAgent.api import encode_image + + +def init_action_chat(): + operation_history = [] + sysetm_prompt = "You are a helpful AI mobile phone operating assistant. You need to help me operate the phone to complete the user\'s instruction." + operation_history.append(["system", [{"type": "text", "text": sysetm_prompt}]]) + return operation_history + + +def init_reflect_chat(): + operation_history = [] + sysetm_prompt = "You are a helpful AI mobile phone operating assistant." + operation_history.append(["system", [{"type": "text", "text": sysetm_prompt}]]) + return operation_history + + +def init_memory_chat(): + operation_history = [] + sysetm_prompt = "You are a helpful AI mobile phone operating assistant." 
+ operation_history.append(["system", [{"type": "text", "text": sysetm_prompt}]]) + return operation_history + + +def add_response(role, prompt, chat_history, image=None): + new_chat_history = copy.deepcopy(chat_history) + if image: + base64_image = encode_image(image) + content = [ + { + "type": "text", + "text": prompt + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }, + ] + else: + content = [ + { + "type": "text", + "text": prompt + }, + ] + new_chat_history.append([role, content]) + return new_chat_history + + +def add_response_two_image(role, prompt, chat_history, image): + new_chat_history = copy.deepcopy(chat_history) + + base64_image1 = encode_image(image[0]) + base64_image2 = encode_image(image[1]) + content = [ + { + "type": "text", + "text": prompt + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image1}" + } + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image2}" + } + }, + ] + + new_chat_history.append([role, content]) + return new_chat_history + + +def print_status(chat_history): + print("*"*100) + for chat in chat_history: + print("role:", chat[0]) + print(chat[1][0]["text"] + ""*(len(chat[1])-1) + "\n") + print("*"*100) \ No newline at end of file diff --git a/MobileAgent/crop.py b/MobileAgent/crop.py new file mode 100644 index 0000000000000000000000000000000000000000..22efc87a640239bb5fec615230efc515b6592bf0 --- /dev/null +++ b/MobileAgent/crop.py @@ -0,0 +1,141 @@ +import math +import cv2 +import numpy as np +from PIL import Image, ImageDraw, ImageFont +import clip +import torch + + +def crop_image(img, position): + def distance(x1,y1,x2,y2): + return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2)) + position = position.tolist() + for i in range(4): + for j in range(i+1, 4): + if(position[i][0] > position[j][0]): + tmp = position[j] + position[j] = position[i] + position[i] = tmp + if position[0][1] > position[1][1]: + tmp = position[0] + position[0] = position[1] + position[1] = tmp + + if position[2][1] > position[3][1]: + tmp = position[2] + position[2] = position[3] + position[3] = tmp + + x1, y1 = position[0][0], position[0][1] + x2, y2 = position[2][0], position[2][1] + x3, y3 = position[3][0], position[3][1] + x4, y4 = position[1][0], position[1][1] + + corners = np.zeros((4,2), np.float32) + corners[0] = [x1, y1] + corners[1] = [x2, y2] + corners[2] = [x4, y4] + corners[3] = [x3, y3] + + img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2) + img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2) + + corners_trans = np.zeros((4,2), np.float32) + corners_trans[0] = [0, 0] + corners_trans[1] = [img_width - 1, 0] + corners_trans[2] = [0, img_height - 1] + corners_trans[3] = [img_width - 1, img_height - 1] + + transform = cv2.getPerspectiveTransform(corners, corners_trans) + dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height))) + return dst + + +def calculate_size(box): + return (box[2]-box[0]) * (box[3]-box[1]) + + +def calculate_iou(box1, box2): + xA = max(box1[0], box2[0]) + yA = max(box1[1], box2[1]) + xB = min(box1[2], box2[2]) + yB = min(box1[3], box2[3]) + + interArea = max(0, xB - xA) * max(0, yB - yA) + box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1]) + box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1]) + unionArea = box1Area + box2Area - interArea + iou = interArea / unionArea + + return iou + + +def crop(image, box, i, text_data=None): + image = Image.open(image) + + 
if text_data: + draw = ImageDraw.Draw(image) + draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5) + # font_size = int((text_data[3] - text_data[1])*0.75) + # font = ImageFont.truetype("arial.ttf", font_size) + # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red") + + cropped_image = image.crop(box) + cropped_image.save(f"./temp/{i}.jpg") + + +def in_box(box, target): + if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]): + return True + else: + return False + + +def crop_for_clip(image, box, i, position): + image = Image.open(image) + w, h = image.size + if position == "left": + bound = [0, 0, w/2, h] + elif position == "right": + bound = [w/2, 0, w, h] + elif position == "top": + bound = [0, 0, w, h/2] + elif position == "bottom": + bound = [0, h/2, w, h] + elif position == "top left": + bound = [0, 0, w/2, h/2] + elif position == "top right": + bound = [w/2, 0, w, h/2] + elif position == "bottom left": + bound = [0, h/2, w/2, h] + elif position == "bottom right": + bound = [w/2, h/2, w, h] + else: + bound = [0, 0, w, h] + + if in_box(box, bound): + cropped_image = image.crop(box) + cropped_image.save(f"./temp/{i}.jpg") + return True + else: + return False + + +def clip_for_icon(clip_model, clip_preprocess, images, prompt): + image_features = [] + for image_file in images: + image = clip_preprocess(Image.open(image_file)).unsqueeze(0).to(next(clip_model.parameters()).device) + image_feature = clip_model.encode_image(image) + image_features.append(image_feature) + image_features = torch.cat(image_features) + + text = clip.tokenize([prompt]).to(next(clip_model.parameters()).device) + text_features = clip_model.encode_text(text) + + image_features /= image_features.norm(dim=-1, keepdim=True) + text_features /= text_features.norm(dim=-1, keepdim=True) + similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0) + _, max_pos = torch.max(similarity, dim=0) + pos = max_pos.item() + + return pos diff --git a/MobileAgent/icon_localization.py b/MobileAgent/icon_localization.py new file mode 100644 index 0000000000000000000000000000000000000000..050912e659c2a4d75ffe47a5c7472462cbdeac51 --- /dev/null +++ b/MobileAgent/icon_localization.py @@ -0,0 +1,59 @@ +from MobileAgent.crop import calculate_size, calculate_iou +from modelscope.pipelines import pipeline +from PIL import Image +import torch + +def remove_boxes(boxes_filt, size, iou_threshold=0.5): + boxes_to_remove = set() + + for i in range(len(boxes_filt)): + if calculate_size(boxes_filt[i]) > 0.05*size[0]*size[1]: + boxes_to_remove.add(i) + for j in range(len(boxes_filt)): + if calculate_size(boxes_filt[j]) > 0.05*size[0]*size[1]: + boxes_to_remove.add(j) + if i == j: + continue + if i in boxes_to_remove or j in boxes_to_remove: + continue + iou = calculate_iou(boxes_filt[i], boxes_filt[j]) + if iou >= iou_threshold: + boxes_to_remove.add(j) + + boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove] + + return boxes_filt + + +def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5): + image = Image.open(input_image_path) + size = image.size + + caption = caption.lower() + caption = caption.strip() + if not caption.endswith('.'): + caption = caption + '.' 
+ + inputs = { + 'IMAGE_PATH': input_image_path, + 'TEXT_PROMPT': caption, + 'BOX_TRESHOLD': box_threshold, + 'TEXT_TRESHOLD': text_threshold + } + + result = groundingdino_model(inputs) + boxes_filt = result['boxes'] + + H, W = size[1], size[0] + for i in range(boxes_filt.size(0)): + boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H]) + boxes_filt[i][:2] -= boxes_filt[i][2:] / 2 + boxes_filt[i][2:] += boxes_filt[i][:2] + + boxes_filt = boxes_filt.cpu().int().tolist() + filtered_boxes = remove_boxes(boxes_filt, size) # [:9] + coordinates = [] + for box in filtered_boxes: + coordinates.append([box[0], box[1], box[2], box[3]]) + + return coordinates \ No newline at end of file diff --git a/MobileAgent/local_server.py b/MobileAgent/local_server.py new file mode 100644 index 0000000000000000000000000000000000000000..58e2e37617714a53c243b58d04615bd21613c4b6 --- /dev/null +++ b/MobileAgent/local_server.py @@ -0,0 +1,172 @@ +import os +import base64 +from io import BytesIO +from PIL import Image + +from MobileAgent.api import inference_chat +from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt +from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image + +from dashscope import MultiModalConversation +import dashscope +import concurrent + + +API_url = os.environ.get('url') +token = os.environ.get('token') + + +def base64_to_pil(base64_string): + if base64_string.startswith('data:image'): + base64_string = base64_string.split(',')[-1] + image_data = base64.b64decode(base64_string) + image_stream = BytesIO(image_data) + pil_image = Image.open(image_stream) + return pil_image + + +def process_image(image, query): + dashscope.api_key = os.environ.get('qwen') + image = "file://" + image + messages = [{ + 'role': 'user', + 'content': [ + { + 'image': image + }, + { + 'text': query + }, + ] + }] + response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages) + + try: + response = response['output']['choices'][0]['message']['content'][0]["text"] + except: + response = "This is an icon." 
+ + return response + + +if not os.path.exists("screenshot"): + os.mkdir("screenshot") +if not os.path.exists("temp"): + os.mkdir("temp") + + +def mobile_agent_infer(json_data): + task = json_data["task"] + if task == "caption": + query = json_data["query"] + images = json_data["images"] + local_images = [] + for image in images: + image_name = image["image_name"] + image_file = image["image_file"] + image_file = base64_to_pil(image_file) + image_path = "temp/" + image_name + image_file.save(image_path, "PNG") + local_images.append(image_path) + + icon_map = {} + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = {executor.submit(process_image, image, query): i for i, image in enumerate(local_images)} + + for future in concurrent.futures.as_completed(futures): + i = futures[future] + response = future.result() + icon_map[i + 1] = response + + output = {"icon_map": icon_map} + return output + + elif task == "planning": + instruction = json_data["instruction"] + thought_history = json_data["thought_history"] + summary_history = json_data["summary_history"] + action_history = json_data["action_history"] + completed_requirements = json_data["completed_requirements"] + add_info = json_data["add_info"] + + prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history, + completed_requirements, add_info) + chat_planning = init_memory_chat() + chat_planning = add_response("user", prompt_planning, chat_planning) + output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token) + + output = {"planning": output_planning} + return output + + elif task == "decision": + screenshot_file = json_data["screenshot_file"] + screenshot_file = base64_to_pil(screenshot_file) + image_path = "screenshot/screenshot_local.png" + screenshot_file.save(image_path, "PNG") + + instruction = json_data["instruction"] + perception_infos = json_data["perception_infos"] + width = json_data["width"] + height = json_data["height"] + summary_history = json_data["summary_history"] + action_history = json_data["action_history"] + summary = json_data["summary"] + action = json_data["action"] + add_info = json_data["add_info"] + error_flag = json_data["error_flag"] + completed_requirements = json_data["completed_requirements"] + memory = json_data["memory"] + memory_switch = json_data["memory_switch"] + insight = json_data["insight"] + + prompt_action = get_action_prompt(instruction, perception_infos, width, height, summary_history, + action_history, summary, action, add_info, error_flag, completed_requirements, + memory) + chat_action = init_action_chat() + chat_action = add_response("user", prompt_action, chat_action, image_path) + output_action = inference_chat(chat_action, 'gpt-4o', API_url, token) + if output_action == "No token": + output = {"decision": "No token", "memory": None} + return output + chat_action = add_response("assistant", output_action, chat_action) + + output_memory = None + if memory_switch: + prompt_memory = get_memory_prompt(insight) + chat_action = add_response("user", prompt_memory, chat_action) + output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token) + + output = {"decision": output_action, "memory": output_memory} + return output + + elif task == "reflection": + screenshot_file = json_data["screenshot_file"] + screenshot_file = base64_to_pil(screenshot_file) + image_path = "screenshot/screenshot_local.png" + screenshot_file.save(image_path, "PNG") + last_screenshot_file = json_data["last_screenshot_file"] + 
last_screenshot_file = base64_to_pil(last_screenshot_file) + last_image_path = "screenshot/last_screenshot_local.png" + last_screenshot_file.save(last_image_path, "PNG") + + instruction = json_data["instruction"] + last_perception_infos = json_data["last_perception_infos"] + perception_infos = json_data["perception_infos"] + width = json_data["width"] + height = json_data["height"] + summary = json_data["summary"] + action = json_data["action"] + add_info = json_data["add_info"] + + prompt_reflect = get_reflect_prompt(instruction, last_perception_infos, perception_infos, width, height, + summary, action, add_info) + chat_reflect = init_reflect_chat() + chat_reflect = add_response_two_image("user", prompt_reflect, chat_reflect, [last_image_path, image_path]) + output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token) + + output = {"reflection": output_reflect} + return output + + else: + output = {"error": "The task must be in \"caption\", \"planning\", \"decision\" and \"reflection\"."} + return output \ No newline at end of file diff --git a/MobileAgent/prompt_no_input.py b/MobileAgent/prompt_no_input.py new file mode 100644 index 0000000000000000000000000000000000000000..e328a2a097a1e4c20939534b11d3f50888bfce13 --- /dev/null +++ b/MobileAgent/prompt_no_input.py @@ -0,0 +1,174 @@ +def get_action_prompt(instruction, clickable_infos, width, height, summary_history, action_history, last_summary, last_action, add_info, error_flag, completed_content, memory): + prompt = "### Background ###\n" + prompt += f"This image is a phone screenshot. Its width is {width} pixels and its height is {height} pixels. The user\'s instruction is: {instruction}.\n\n" + + prompt += "### Screenshot information ###\n" + prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. " + prompt += "This information consists of two parts: coordinates; content. " + prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. " + prompt += "The information is as follow:\n" + + for clickable_info in clickable_infos: + if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0): + prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n" + + prompt += "Please note that this information is not necessarily accurate. You need to combine the screenshot to understand." + prompt += "\n\n" + + if add_info != "": + prompt += "### Hint ###\n" + prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n" + prompt += add_info + prompt += "\n\n" + + if len(action_history) > 0: + prompt += "### History operations ###\n" + prompt += "Before reaching this page, some operations have been completed. You need to refer to the completed operations to decide the next operation. 
These operations are as follow:\n" + for i in range(len(action_history)): + prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(" to ")[0].strip() + "; Action: " + action_history[i] + "]\n" + prompt += "\n" + + if completed_content != "": + prompt += "### Progress ###\n" + prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n" + prompt += "Completed contents:\n" + completed_content + "\n\n" + + if memory != "": + prompt += "### Memory ###\n" + prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n" + prompt += "Memory:\n" + memory + "\n" + + if error_flag: + prompt += "### Last operation ###\n" + prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time." + prompt += "\n\n" + + prompt += "### Response requirements ###\n" + prompt += "Now you need to combine all of the above to perform just one action on the current page. You must choose one of the six actions below:\n" + prompt += "Open app (app name): If the current page is desktop, you can use this action to open the app named \"app name\" on the desktop.\n" + prompt += "Tap (x, y): Tap the position (x, y) in current page.\n" + prompt += "Swipe (x1, y1), (x2, y2): Swipe from position (x1, y1) to position (x2, y2).\n" + prompt += "Type (text): Type the \"text\" in the input box.\n" + prompt += "Home: Return to home page.\n" + prompt += "Stop: If you think all the requirements of user\'s instruction have been completed and no further operation is required, you can choose this action to terminate the operation process." + prompt += "\n\n" + + prompt += "### Output format ###\n" + prompt += "Your output consists of the following three parts:\n" + prompt += "### Thought ###\nThink about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation.\n" + prompt += "### Action ###\nYou can only choose one from the six actions above. Make sure that the coordinates or text in the \"()\".\n" + prompt += "### Operation ###\nPlease generate a brief natural language description for the operation in Action based on your Thought." + + return prompt + + +def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info): + prompt = f"These images are two phone screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n" + + prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. " + prompt += "The information consists of two parts, consisting of format: coordinates; content. " + prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively " + prompt += "The keyboard status is whether the keyboard of the current page is activated." 
+ prompt += "\n\n" + + prompt += "### Before the current operation ###\n" + prompt += "Screenshot information:\n" + for clickable_info in clickable_infos1: + if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0): + prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n" + prompt += "\n" + + prompt += "### After the current operation ###\n" + prompt += "Screenshot information:\n" + for clickable_info in clickable_infos2: + if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0): + prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n" + prompt += "\n" + + prompt += "### Current operation ###\n" + prompt += f"The user\'s instruction is: {instruction}. You also need to note the following requirements: {add_info}. In the process of completing the requirements of instruction, an operation is performed on the phone. Below are the details of this operation:\n" + prompt += "Operation thought: " + summary.split(" to ")[0].strip() + "\n" + prompt += "Operation action: " + action + prompt += "\n\n" + + prompt += "### Response requirements ###\n" + prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n" + prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n" + prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n" + prompt += "B: The \"Operation action\" results in a wrong page and I need to return to the previous page.\n" + prompt += "C: The \"Operation action\" produces no changes." + prompt += "\n\n" + + prompt += "### Output format ###\n" + prompt += "Your output format is:\n" + prompt += "### Thought ###\nYour thought about the question\n" + prompt += "### Answer ###\nA or B or C" + + return prompt + + +def get_memory_prompt(insight): + if insight != "": + prompt = "### Important content ###\n" + prompt += insight + prompt += "\n\n" + + prompt += "### Response requirements ###\n" + prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n" + + else: + prompt = "### Response requirements ###\n" + prompt += "Please think about whether there is any content closely related to user\'s instrcution on the current page? If there is, please output the content. If not, please output \"None\".\n\n" + + prompt += "### Output format ###\n" + prompt += "Your output format is:\n" + prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###." + + return prompt + +def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info): + prompt = "### Background ###\n" + prompt += f"There is an user\'s instruction which is: {instruction}. You are a mobile phone operating assistant and are operating the user\'s mobile phone.\n\n" + + if add_info != "": + prompt += "### Hint ###\n" + prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n" + prompt += add_info + prompt += "\n\n" + + if len(thought_history) > 1: + prompt += "### History operations ###\n" + prompt += "To complete the requirements of user\'s instruction, you have performed a series of operations. 
These operations are as follow:\n" + for i in range(len(summary_history)): + operation = summary_history[i].split(" to ")[0].strip() + prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n" + prompt += "\n" + + prompt += "### Progress thinking ###\n" + prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n" + prompt += "Completed contents:\n" + completed_content + "\n\n" + + prompt += "### Response requirements ###\n" + prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n" + + prompt += "### Output format ###\n" + prompt += "Your output format is:\n" + prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###." + + else: + prompt += "### Current operation ###\n" + prompt += "To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n" + prompt += f"Operation thought: {thought_history[-1]}\n" + operation = summary_history[-1].split(" to ")[0].strip() + prompt += f"Operation action: {operation}\n\n" + + prompt += "### Response requirements ###\n" + prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n" + prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n" + + prompt += "### Output format ###\n" + prompt += "Your output format is:\n" + prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. 
Just summarize the contents that have been actually completed in the ### Current operation ###.\n" + prompt += "(Please use English to output)" + + return prompt \ No newline at end of file diff --git a/MobileAgent/text_localization.py b/MobileAgent/text_localization.py new file mode 100644 index 0000000000000000000000000000000000000000..f2628d54e9dbe4618904d83ea0abb794bf3f5ee2 --- /dev/null +++ b/MobileAgent/text_localization.py @@ -0,0 +1,58 @@ +import cv2 +import numpy as np +from MobileAgent.crop import crop_image, calculate_size +from PIL import Image + + +def order_point(coor): + arr = np.array(coor).reshape([4, 2]) + sum_ = np.sum(arr, 0) + centroid = sum_ / arr.shape[0] + theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0]) + sort_points = arr[np.argsort(theta)] + sort_points = sort_points.reshape([4, -1]) + if sort_points[0][0] > centroid[0]: + sort_points = np.concatenate([sort_points[3:], sort_points[:3]]) + sort_points = sort_points.reshape([4, 2]).astype('float32') + return sort_points + + +def longest_common_substring_length(str1, str2): + m = len(str1) + n = len(str2) + dp = [[0] * (n + 1) for _ in range(m + 1)] + + for i in range(1, m + 1): + for j in range(1, n + 1): + if str1[i - 1] == str2[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + return dp[m][n] + + +def ocr(image_path, ocr_detection, ocr_recognition): + text_data = [] + coordinate = [] + + image_full = cv2.imread(image_path) + det_result = ocr_detection(image_full) + det_result = det_result['polygons'] + for i in range(det_result.shape[0]): + pts = order_point(det_result[i]) + image_crop = crop_image(image_full, pts) + + try: + result = ocr_recognition(image_crop)['text'][0] + except: + continue + + box = [int(e) for e in list(pts.reshape(-1))] + box = [box[0], box[1], box[4], box[5]] + + text_data.append(result) + coordinate.append(box) + + else: + return text_data, coordinate \ No newline at end of file diff --git a/README.md b/README.md index 92796623be67f8d82a64c58383c495c5dc56796c..dd2803cc965547910eb3bb808a6a3dc24d2d6136 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ --- title: Mobile Agent -emoji: 😻 -colorFrom: blue -colorTo: red +emoji: 🦀 +colorFrom: indigo +colorTo: green sdk: gradio -sdk_version: 4.37.1 +sdk_version: 4.19.1 app_file: app.py pinned: false +license: mit --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..4e24b64fcac80d26470061cf76e6562b4c5a8bdd --- /dev/null +++ b/app.py @@ -0,0 +1,465 @@ +import io +import os +import shutil +import base64 +import gradio as gr +from PIL import Image, ImageDraw + +from MobileAgent.text_localization import ocr +from MobileAgent.icon_localization import det +from MobileAgent.local_server import mobile_agent_infer + +from modelscope import snapshot_download +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + + +chatbot_css = """ + +""" + + +temp_file = "temp" +screenshot = "screenshot" +cache = "cache" +if not os.path.exists(temp_file): + os.mkdir(temp_file) +if not os.path.exists(screenshot): + os.mkdir(screenshot) +if not os.path.exists(cache): + os.mkdir(cache) + + +groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0') +groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir) +ocr_detection = pipeline(Tasks.ocr_detection, 
model='damo/cv_resnet18_ocr-detection-line-level_damo') +ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo') + + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + +def get_all_files_in_folder(folder_path): + file_list = [] + for file_name in os.listdir(folder_path): + file_list.append(file_name) + return file_list + + +def crop(image, box, i): + image = Image.open(image) + x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) + if x1 >= x2-10 or y1 >= y2-10: + return + cropped_image = image.crop((x1, y1, x2, y2)) + cropped_image.save(f"./temp/{i}.png", format="PNG") + + +def merge_text_blocks(text_list, coordinates_list): + merged_text_blocks = [] + merged_coordinates = [] + + sorted_indices = sorted(range(len(coordinates_list)), key=lambda k: (coordinates_list[k][1], coordinates_list[k][0])) + sorted_text_list = [text_list[i] for i in sorted_indices] + sorted_coordinates_list = [coordinates_list[i] for i in sorted_indices] + + num_blocks = len(sorted_text_list) + merge = [False] * num_blocks + + for i in range(num_blocks): + if merge[i]: + continue + + anchor = i + + group_text = [sorted_text_list[anchor]] + group_coordinates = [sorted_coordinates_list[anchor]] + + for j in range(i+1, num_blocks): + if merge[j]: + continue + + if abs(sorted_coordinates_list[anchor][0] - sorted_coordinates_list[j][0]) < 10 and \ + sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] >= -10 and sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] < 30 and \ + abs(sorted_coordinates_list[anchor][3] - sorted_coordinates_list[anchor][1] - (sorted_coordinates_list[j][3] - sorted_coordinates_list[j][1])) < 10: + group_text.append(sorted_text_list[j]) + group_coordinates.append(sorted_coordinates_list[j]) + merge[anchor] = True + anchor = j + merge[anchor] = True + + merged_text = "\n".join(group_text) + min_x1 = min(group_coordinates, key=lambda x: x[0])[0] + min_y1 = min(group_coordinates, key=lambda x: x[1])[1] + max_x2 = max(group_coordinates, key=lambda x: x[2])[2] + max_y2 = max(group_coordinates, key=lambda x: x[3])[3] + + merged_text_blocks.append(merged_text) + merged_coordinates.append([min_x1, min_y1, max_x2, max_y2]) + + return merged_text_blocks, merged_coordinates + + +def get_perception_infos(screenshot_file): + width, height = Image.open(screenshot_file).size + + text, coordinates = ocr(screenshot_file, ocr_detection, ocr_recognition) + text, coordinates = merge_text_blocks(text, coordinates) + + perception_infos = [] + for i in range(len(coordinates)): + perception_info = {"text": "text: " + text[i], "coordinates": coordinates[i]} + perception_infos.append(perception_info) + + coordinates = det(screenshot_file, "icon", groundingdino_model) + + for i in range(len(coordinates)): + perception_info = {"text": "icon", "coordinates": coordinates[i]} + perception_infos.append(perception_info) + + image_box = [] + image_id = [] + for i in range(len(perception_infos)): + if perception_infos[i]['text'] == 'icon': + image_box.append(perception_infos[i]['coordinates']) + image_id.append(i) + + for i in range(len(image_box)): + crop(screenshot_file, image_box[i], image_id[i]) + + images = get_all_files_in_folder(temp_file) + if len(images) > 0: + images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0])) + image_id = [int(image.split('/')[-1].split('.')[0]) for image in images] + icon_map = {} + prompt = 
'This image is an icon from a phone screen. Please briefly describe the shape and color of this icon in one sentence.' + + string_image = [] + for i in range(len(images)): + image_path = os.path.join(temp_file, images[i]) + string_image.append({"image_name": images[i], "image_file": encode_image(image_path)}) + query_data = {"task": "caption", "images": string_image, "query": prompt} + response_query = mobile_agent_infer(query_data) + icon_map = response_query["icon_map"] + + for i, j in zip(image_id, range(1, len(image_id)+1)): + if icon_map.get(str(j)): + perception_infos[i]['text'] = "icon: " + icon_map[str(j)] + + for i in range(len(perception_infos)): + perception_infos[i]['coordinates'] = [int((perception_infos[i]['coordinates'][0]+perception_infos[i]['coordinates'][2])/2), int((perception_infos[i]['coordinates'][1]+perception_infos[i]['coordinates'][3])/2)] + + return perception_infos, width, height + + +def image_to_base64(image): + buffered = io.BytesIO() + image.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") + img_html = f'' + return img_html + + +def chatbot(image, instruction, add_info, history, chat_log): + if history == {}: + thought_history = [] + summary_history = [] + action_history = [] + summary = "" + action = "" + completed_requirements = "" + memory = "" + insight = "" + error_flag = False + user_msg = "
{}
".format(instruction) + else: + thought_history = history["thought_history"] + summary_history = history["summary_history"] + action_history = history["action_history"] + summary = history["summary"] + action = history["action"] + completed_requirements = history["completed_requirements"] + memory = history["memory"][0] + insight = history["insight"] + error_flag = history["error_flag"] + user_msg = "
{}
".format("I have uploaded the screenshot. Please continue operating.") + + images = get_all_files_in_folder(cache) + if len(images) > 0 and len(images) <= 100: + images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0])) + image_id = [int(image.split('/')[-1].split('.')[0]) for image in images] + cur_image_id = image_id[-1] + 1 + elif len(images) > 100: + images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0])) + image_id = [int(image.split('/')[-1].split('.')[0]) for image in images] + cur_image_id = image_id[-1] + 1 + os.remove(os.path.join(cache, str(image_id[0])+".png")) + else: + cur_image_id = 1 + + image.save(os.path.join(cache, str(cur_image_id) + ".png"), format="PNG") + screenshot_file = os.path.join(cache, str(cur_image_id) + ".png") + perception_infos, width, height = get_perception_infos(screenshot_file) + shutil.rmtree(temp_file) + os.mkdir(temp_file) + + local_screenshot_file = encode_image(screenshot_file) + query_data = { + "task": "decision", + "screenshot_file": local_screenshot_file, + "instruction": instruction, + "perception_infos": perception_infos, + "width": width, + "height": height, + "summary_history": summary_history, + "action_history": action_history, + "summary": summary, + "action": action, + "add_info": add_info, + "error_flag": error_flag, + "completed_requirements": completed_requirements, + "memory": memory, + "memory_switch": True, + "insight": insight + } + + response_query = mobile_agent_infer(query_data) + output_action = response_query["decision"] + output_memory = response_query["memory"] + if output_action == "No token": + bot_response = ["
{}
".format("Sorry, the resources can be exhausted today.")] + chat_html = "
{}
".format("".join(bot_response)) + return chatbot_css + chat_html, history, chat_log + + thought = output_action.split("### Thought ###")[-1].split("### Action ###")[0].replace("\n", " ").replace(":", "").replace(" ", " ").strip() + summary = output_action.split("### Operation ###")[-1].replace("\n", " ").replace(" ", " ").strip() + action = output_action.split("### Action ###")[-1].split("### Operation ###")[0].replace("\n", " ").replace(" ", " ").strip() + + output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n" + if "None" not in output_memory and output_memory not in memory: + memory += output_memory + + if "Open app" in action: + bot_response = "Please click the red circle and upload the current screenshot again." + app_name = action.split("(")[-1].split(")")[0] + text, coordinate = ocr(screenshot_file, ocr_detection, ocr_recognition) + for ti in range(len(text)): + if app_name == text[ti]: + name_coordinate = [int((coordinate[ti][0] + coordinate[ti][2])/2), int((coordinate[ti][1] + coordinate[ti][3])/2)] + x, y = name_coordinate[0], name_coordinate[1] + radius = 75 + draw = ImageDraw.Draw(image) + draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10) + break + + elif "Tap" in action: + bot_response = "Please click the red circle and upload the current screenshot again." + coordinate = action.split("(")[-1].split(")")[0].split(", ") + x, y = int(coordinate[0]), int(coordinate[1]) + radius = 75 + draw = ImageDraw.Draw(image) + draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10) + + elif "Swipe" in action: + bot_response = "Please slide from red circle to blue circle and upload the current screenshot again." + coordinate1 = action.split("Swipe (")[-1].split("), (")[0].split(", ") + coordinate2 = action.split("), (")[-1].split(")")[0].split(", ") + x1, y1 = int(coordinate1[0]), int(coordinate1[1]) + x2, y2 = int(coordinate2[0]), int(coordinate2[1]) + radius = 75 + draw = ImageDraw.Draw(image) + draw.ellipse([x1 - radius, y1 - radius, x1 + radius, y1 + radius], outline='red', width=10) + draw.ellipse([x2 - radius, y2 - radius, x2 + radius, y2 + radius], outline='blue', width=10) + + elif "Type" in action: + if "(text)" not in action: + text = action.split("(")[-1].split(")")[0] + else: + text = action.split(" \"")[-1].split("\"")[0] + bot_response = f"Please type the \"{text}\" and upload the current screenshot again." + + elif "Back" in action: + bot_response = f"Please back to previous page and upload the current screenshot again." + + elif "Home" in action: + bot_response = f"Please back to home page and upload the current screenshot again." + + elif "Stop" in action: + bot_response = f"Task completed." + + bot_text1 = "
{}
".format("### Decision ###") + bot_thought = "
{}
".format("Thought: " + thought) + bot_action = "
{}
".format("Action: " + action) + bot_operation = "
{}
".format("Operation: " + summary) + bot_text2 = "
{}
".format("### Memory ###") + bot_memory = "
{}
".format(output_memory) + bot_response = "
{}
".format(bot_response) + if image is not None: + bot_img_html = image_to_base64(image) + bot_response = "
{}
".format(bot_img_html) + bot_response + + chat_log.append(user_msg) + + thought_history.append(thought) + summary_history.append(summary) + action_history.append(action) + + history["thought_history"] = thought_history + history["summary_history"] = summary_history + history["action_history"] = action_history + history["summary"] = summary + history["action"] = action + history["memory"] = memory, + history["memory_switch"] = True, + history["insight"] = insight + history["error_flag"] = error_flag + + query_data = { + "task": "planning", + "instruction": instruction, + "thought_history": thought_history, + "summary_history": summary_history, + "action_history": action_history, + "completed_requirements": "", + "add_info": add_info + } + + response_query = mobile_agent_infer(query_data) + output_planning = response_query["planning"] + if output_planning == "No token": + bot_response = ["
{}
".format("Sorry, the resources can be exhausted today.")] + chat_html = "
{}
".format("".join(bot_response)) + return chatbot_css + chat_html, history, chat_log + + output_planning = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip() + history["completed_requirements"] = output_planning + + bot_text3 = "
{}
".format("### Planning ###") + output_planning = "
{}
".format(output_planning) + + chat_log.append(bot_text3) + chat_log.append(output_planning) + chat_log.append(bot_text1) + chat_log.append(bot_thought) + chat_log.append(bot_action) + chat_log.append(bot_operation) + chat_log.append(bot_text2) + chat_log.append(bot_memory) + chat_log.append(bot_response) + + chat_html = "
{}
".format("".join(chat_log)) + + return chatbot_css + chat_html, history, chat_log + + +def lock_input(instruction): + return gr.update(value=instruction, interactive=False), gr.update(value=None) + + +def reset_demo(): + return gr.update(value="", interactive=True), gr.update(value="If you want to tap an icon of an app, use the action \"Open app\"", interactive=True), "
", {}, [] + + +tos_markdown = ("""
+If you like our project, please give us a star ✨ on Github for latest update. + +**Terms of use** +1. Input your instruction in \"Instruction\", for example \"Turn on the dark mode\". +2. You can input helpful operation knowledge in \"Knowledge\". +3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the screenshot after your operation. +4. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom to experience. +5. Due to limited resources, each operation may take a long time, please be patient and wait. + +**使用说明** +1. 在“Instruction”中输入你的指令,例如“打开深色模式”。 +2. 你可以在“Knowledge”中输入帮助性的操作知识。 +3. 点击“Submit”来获得操作。你需要根据输出来操作手机,并且上传操作后的截图。 +4. “Example”中的5个例子是一个任务。从上到下点击它们并且点击“Submit”来体验。 +5. 由于资源有限,每次操作的时间会比较长,请耐心等待。""") + +title_markdowm = ("""# Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration""") + +instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction") +knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge", value="If you want to tap an icon of an app, use the action \"Open app\"") +with gr.Blocks() as demo: + history_state = gr.State(value={}) + history_output = gr.State(value=[]) + with gr.Row(): + gr.Markdown(title_markdowm) + with gr.Row(): + with gr.Column(scale=5): + gr.Markdown(tos_markdown) + with gr.Row(): + image_input = gr.Image(label="Screenshot", type="pil", height=550, width=230) + gr.Examples(examples=[ + ["./example/example_1.jpg", "Turn on the dark mode"], + ["./example/example_2.jpg", "Turn on the dark mode"], + ["./example/example_3.jpg", "Turn on the dark mode"], + ["./example/example_4.jpg", "Turn on the dark mode"], + ["./example/example_5.jpg", "Turn on the dark mode"], + ], inputs=[image_input, instruction_input, knowledge_input]) + + with gr.Column(scale=6): + instruction_input.render() + knowledge_input.render() + with gr.Row(): + start_button = gr.Button("Submit") + clear_button = gr.Button("Clear") + output_component = gr.HTML(label="Chat history", value="
") + + start_button.click( + fn=lambda image, instruction, add_info, history, output: chatbot(image, instruction, add_info, history, output), + inputs=[image_input, instruction_input, knowledge_input, history_state, history_output], + outputs=[output_component, history_state, history_output] + ) + + clear_button.click( + fn=reset_demo, + inputs=[], + outputs=[instruction_input, knowledge_input, output_component, history_state, history_output] + ) + +demo.queue().launch(share=True) \ No newline at end of file diff --git a/cache/1.png b/cache/1.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/1.png differ diff --git a/cache/10.png b/cache/10.png new file mode 100644 index 0000000000000000000000000000000000000000..6220b850fe9121c04b60671ffaf0178b919ac3b7 Binary files /dev/null and b/cache/10.png differ diff --git a/cache/11.png b/cache/11.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/11.png differ diff --git a/cache/12.png b/cache/12.png new file mode 100644 index 0000000000000000000000000000000000000000..4b35867fd1af91c2ecfea7f55025d9761cf238de Binary files /dev/null and b/cache/12.png differ diff --git a/cache/13.png b/cache/13.png new file mode 100644 index 0000000000000000000000000000000000000000..d737b5690dc22791496d21fa38c7f46270f313c1 Binary files /dev/null and b/cache/13.png differ diff --git a/cache/14.png b/cache/14.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/14.png differ diff --git a/cache/15.png b/cache/15.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/15.png differ diff --git a/cache/16.png b/cache/16.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/16.png differ diff --git a/cache/17.png b/cache/17.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/17.png differ diff --git a/cache/18.png b/cache/18.png new file mode 100644 index 0000000000000000000000000000000000000000..4b35867fd1af91c2ecfea7f55025d9761cf238de Binary files /dev/null and b/cache/18.png differ diff --git a/cache/19.png b/cache/19.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/19.png differ diff --git a/cache/2.png b/cache/2.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/2.png differ diff --git a/cache/20.png b/cache/20.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/20.png differ diff --git a/cache/21.png b/cache/21.png new file mode 100644 index 0000000000000000000000000000000000000000..4b35867fd1af91c2ecfea7f55025d9761cf238de Binary files /dev/null and b/cache/21.png differ diff --git a/cache/22.png b/cache/22.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/22.png differ diff --git a/cache/23.png b/cache/23.png new 
file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/23.png differ diff --git a/cache/24.png b/cache/24.png new file mode 100644 index 0000000000000000000000000000000000000000..4b35867fd1af91c2ecfea7f55025d9761cf238de Binary files /dev/null and b/cache/24.png differ diff --git a/cache/25.png b/cache/25.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/25.png differ diff --git a/cache/3.png b/cache/3.png new file mode 100644 index 0000000000000000000000000000000000000000..4b35867fd1af91c2ecfea7f55025d9761cf238de Binary files /dev/null and b/cache/3.png differ diff --git a/cache/4.png b/cache/4.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/4.png differ diff --git a/cache/5.png b/cache/5.png new file mode 100644 index 0000000000000000000000000000000000000000..4b35867fd1af91c2ecfea7f55025d9761cf238de Binary files /dev/null and b/cache/5.png differ diff --git a/cache/6.png b/cache/6.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and b/cache/6.png differ diff --git a/cache/7.png b/cache/7.png new file mode 100644 index 0000000000000000000000000000000000000000..4b35867fd1af91c2ecfea7f55025d9761cf238de Binary files /dev/null and b/cache/7.png differ diff --git a/cache/8.png b/cache/8.png new file mode 100644 index 0000000000000000000000000000000000000000..d737b5690dc22791496d21fa38c7f46270f313c1 Binary files /dev/null and b/cache/8.png differ diff --git a/cache/9.png b/cache/9.png new file mode 100644 index 0000000000000000000000000000000000000000..9fdc0d0d842eecff86b4f2904780d89ec954fcf0 Binary files /dev/null and b/cache/9.png differ diff --git a/example/example_1.jpg b/example/example_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6642c7c0a8c9d681e8ae532e7f4c7119b1f275e7 Binary files /dev/null and b/example/example_1.jpg differ diff --git a/example/example_2.jpg b/example/example_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0237dd93979b797b400c5b5b1abb547d273aed71 Binary files /dev/null and b/example/example_2.jpg differ diff --git a/example/example_3.jpg b/example/example_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a27c23eae90cf8e061b718b4bd54e308444624b4 Binary files /dev/null and b/example/example_3.jpg differ diff --git a/example/example_4.jpg b/example/example_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f1df9a910ff8597b445b285a52ee2b07cdee7e78 Binary files /dev/null and b/example/example_4.jpg differ diff --git a/example/example_5.jpg b/example/example_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ee790796ba5a6fabecd781bfce673b6d2bbabec1 Binary files /dev/null and b/example/example_5.jpg differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee3e93103c65c39408ae9e7ba8b7deb9929eaa68 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests +pillow \ No newline at end of file diff --git a/screenshot/screenshot_local.png b/screenshot/screenshot_local.png new file mode 100644 index 0000000000000000000000000000000000000000..1d93362c400f8f9ca303fb734e630defb3ff2a79 Binary files /dev/null and 
b/screenshot/screenshot_local.png differ