阳渠 committed on
Commit
1e96bca
1 Parent(s): 8572674

Mobile-Agent-v2

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. MobileAgent/__pycache__/api.cpython-310.pyc +0 -0
  2. MobileAgent/__pycache__/api_service.cpython-310.pyc +0 -0
  3. MobileAgent/__pycache__/chat.cpython-310.pyc +0 -0
  4. MobileAgent/__pycache__/controller.cpython-310.pyc +0 -0
  5. MobileAgent/__pycache__/crop.cpython-310.pyc +0 -0
  6. MobileAgent/__pycache__/icon_localization.cpython-310.pyc +0 -0
  7. MobileAgent/__pycache__/local_server.cpython-310.pyc +0 -0
  8. MobileAgent/__pycache__/prompt.cpython-310.pyc +0 -0
  9. MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc +0 -0
  10. MobileAgent/__pycache__/text_localization.cpython-310.pyc +0 -0
  11. MobileAgent/api.py +45 -0
  12. MobileAgent/api_service.py +26 -0
  13. MobileAgent/chat.py +86 -0
  14. MobileAgent/crop.py +141 -0
  15. MobileAgent/icon_localization.py +59 -0
  16. MobileAgent/local_server.py +172 -0
  17. MobileAgent/prompt_no_input.py +174 -0
  18. MobileAgent/text_localization.py +58 -0
  19. README.md +5 -4
  20. app.py +465 -0
  21. cache/1.png +0 -0
  22. cache/10.png +0 -0
  23. cache/11.png +0 -0
  24. cache/12.png +0 -0
  25. cache/13.png +0 -0
  26. cache/14.png +0 -0
  27. cache/15.png +0 -0
  28. cache/16.png +0 -0
  29. cache/17.png +0 -0
  30. cache/18.png +0 -0
  31. cache/19.png +0 -0
  32. cache/2.png +0 -0
  33. cache/20.png +0 -0
  34. cache/21.png +0 -0
  35. cache/22.png +0 -0
  36. cache/23.png +0 -0
  37. cache/24.png +0 -0
  38. cache/25.png +0 -0
  39. cache/3.png +0 -0
  40. cache/4.png +0 -0
  41. cache/5.png +0 -0
  42. cache/6.png +0 -0
  43. cache/7.png +0 -0
  44. cache/8.png +0 -0
  45. cache/9.png +0 -0
  46. example/example_1.jpg +0 -0
  47. example/example_2.jpg +0 -0
  48. example/example_3.jpg +0 -0
  49. example/example_4.jpg +0 -0
  50. example/example_5.jpg +0 -0
MobileAgent/__pycache__/api.cpython-310.pyc ADDED
Binary file (1.18 kB)

MobileAgent/__pycache__/api_service.cpython-310.pyc ADDED
Binary file (633 Bytes)

MobileAgent/__pycache__/chat.cpython-310.pyc ADDED
Binary file (1.92 kB)

MobileAgent/__pycache__/controller.cpython-310.pyc ADDED
Binary file (4.05 kB)

MobileAgent/__pycache__/crop.cpython-310.pyc ADDED
Binary file (3.9 kB)

MobileAgent/__pycache__/icon_localization.cpython-310.pyc ADDED
Binary file (1.77 kB)

MobileAgent/__pycache__/local_server.cpython-310.pyc ADDED
Binary file (4.25 kB)

MobileAgent/__pycache__/prompt.cpython-310.pyc ADDED
Binary file (9.8 kB)

MobileAgent/__pycache__/prompt_no_input.cpython-310.pyc ADDED
Binary file (9.04 kB)

MobileAgent/__pycache__/text_localization.cpython-310.pyc ADDED
Binary file (1.98 kB)
MobileAgent/api.py ADDED
@@ -0,0 +1,45 @@
+ import base64
+ import requests
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def inference_chat(chat, model, api_url, token):
+     headers = {
+         "Content-Type": "application/json",
+         "Authorization": f"Bearer {token}"
+     }
+
+     data = {
+         "model": model,
+         "messages": [],
+         "max_tokens": 2048,
+         'temperature': 0.0,
+         "seed": 1234
+     }
+
+     for role, content in chat:
+         data["messages"].append({"role": role, "content": content})
+
+     retry = 3
+     cur_try = 0
+     while True:
+         cur_try += 1
+         if cur_try > retry:
+             return "No token"
+         try:
+             res = requests.post(api_url, headers=headers, json=data)
+             res_json = res.json()
+             res_content = res_json['data']['response']['choices'][0]['message']['content']
+         except:
+             print("Network Error:")
+             try:
+                 print(res.json())
+             except:
+                 print("Request Failed")
+         else:
+             break
+
+     return res_content
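
A minimal usage sketch for these helpers; the endpoint URL and token below are placeholders, and the chat format mirrors the (role, content) pairs built in MobileAgent/chat.py:

from MobileAgent.api import encode_image, inference_chat

API_URL = "https://example.com/v1/chat/completions"  # placeholder endpoint
TOKEN = "your-api-token"                              # placeholder token

# A chat is a list of (role, content) pairs; content is a list of typed parts.
chat = [
    ("system", [{"type": "text", "text": "You are a helpful assistant."}]),
    ("user", [
        {"type": "text", "text": "Describe this screenshot."},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image('screenshot/screenshot_local.png')}"}},
    ]),
]

answer = inference_chat(chat, "gpt-4o", API_URL, TOKEN)  # returns "No token" after 3 failed attempts
print(answer)
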
MobileAgent/api_service.py ADDED
@@ -0,0 +1,26 @@
+ import requests
+ import json
+
+
+ def get_action(query_data, url, token):
+
+     headers = {
+         'Authorization': token,
+         'Content-Type': 'application/json'
+     }
+
+     data = {
+         "model": "pre-Mobile_Agent_Server_ADB_V2-2204",
+         "input": {"json_data": query_data}
+     }
+
+     while True:
+         try:
+             response = requests.post(url, headers=headers, data=json.dumps(data))
+             response.json()["output"]
+         except:
+             print("Network Error:", response.json())
+         else:
+             break
+
+     return response
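
A rough sketch of calling `get_action`; the URL and token are placeholders, and the payload shape follows the task dictionaries that `mobile_agent_infer` consumes, which is an assumption about the remote deployment:

from MobileAgent.api_service import get_action

url = "https://example.com/api/v1/services/aigc/generation"  # placeholder service URL
token = "Bearer your-service-token"                          # placeholder token

query_data = {"task": "planning", "instruction": "Turn on the dark mode",
              "thought_history": [], "summary_history": [], "action_history": [],
              "completed_requirements": "", "add_info": ""}

response = get_action(query_data, url, token)  # retries until the response contains an "output" field
print(response.json()["output"])
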
MobileAgent/chat.py ADDED
@@ -0,0 +1,86 @@
+ import copy
+ from MobileAgent.api import encode_image
+
+
+ def init_action_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI mobile phone operating assistant. You need to help me operate the phone to complete the user\'s instruction."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_reflect_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI mobile phone operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_memory_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI mobile phone operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def add_response(role, prompt, chat_history, image=None):
+     new_chat_history = copy.deepcopy(chat_history)
+     if image:
+         base64_image = encode_image(image)
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+             {
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/jpeg;base64,{base64_image}"
+                 }
+             },
+         ]
+     else:
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+         ]
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def add_response_two_image(role, prompt, chat_history, image):
+     new_chat_history = copy.deepcopy(chat_history)
+
+     base64_image1 = encode_image(image[0])
+     base64_image2 = encode_image(image[1])
+     content = [
+         {
+             "type": "text",
+             "text": prompt
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image1}"
+             }
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image2}"
+             }
+         },
+     ]
+
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def print_status(chat_history):
+     print("*"*100)
+     for chat in chat_history:
+         print("role:", chat[0])
+         print(chat[1][0]["text"] + "<image>"*(len(chat[1])-1) + "\n")
+     print("*"*100)
MobileAgent/crop.py ADDED
@@ -0,0 +1,141 @@
+ import math
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ import clip
+ import torch
+
+
+ def crop_image(img, position):
+     def distance(x1,y1,x2,y2):
+         return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
+     position = position.tolist()
+     for i in range(4):
+         for j in range(i+1, 4):
+             if(position[i][0] > position[j][0]):
+                 tmp = position[j]
+                 position[j] = position[i]
+                 position[i] = tmp
+     if position[0][1] > position[1][1]:
+         tmp = position[0]
+         position[0] = position[1]
+         position[1] = tmp
+
+     if position[2][1] > position[3][1]:
+         tmp = position[2]
+         position[2] = position[3]
+         position[3] = tmp
+
+     x1, y1 = position[0][0], position[0][1]
+     x2, y2 = position[2][0], position[2][1]
+     x3, y3 = position[3][0], position[3][1]
+     x4, y4 = position[1][0], position[1][1]
+
+     corners = np.zeros((4,2), np.float32)
+     corners[0] = [x1, y1]
+     corners[1] = [x2, y2]
+     corners[2] = [x4, y4]
+     corners[3] = [x3, y3]
+
+     img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2)
+     img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2)
+
+     corners_trans = np.zeros((4,2), np.float32)
+     corners_trans[0] = [0, 0]
+     corners_trans[1] = [img_width - 1, 0]
+     corners_trans[2] = [0, img_height - 1]
+     corners_trans[3] = [img_width - 1, img_height - 1]
+
+     transform = cv2.getPerspectiveTransform(corners, corners_trans)
+     dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
+     return dst
+
+
+ def calculate_size(box):
+     return (box[2]-box[0]) * (box[3]-box[1])
+
+
+ def calculate_iou(box1, box2):
+     xA = max(box1[0], box2[0])
+     yA = max(box1[1], box2[1])
+     xB = min(box1[2], box2[2])
+     yB = min(box1[3], box2[3])
+
+     interArea = max(0, xB - xA) * max(0, yB - yA)
+     box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+     unionArea = box1Area + box2Area - interArea
+     iou = interArea / unionArea
+
+     return iou
+
+
+ def crop(image, box, i, text_data=None):
+     image = Image.open(image)
+
+     if text_data:
+         draw = ImageDraw.Draw(image)
+         draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
+         # font_size = int((text_data[3] - text_data[1])*0.75)
+         # font = ImageFont.truetype("arial.ttf", font_size)
+         # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")
+
+     cropped_image = image.crop(box)
+     cropped_image.save(f"./temp/{i}.jpg")
+
+
+ def in_box(box, target):
+     if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]):
+         return True
+     else:
+         return False
+
+
+ def crop_for_clip(image, box, i, position):
+     image = Image.open(image)
+     w, h = image.size
+     if position == "left":
+         bound = [0, 0, w/2, h]
+     elif position == "right":
+         bound = [w/2, 0, w, h]
+     elif position == "top":
+         bound = [0, 0, w, h/2]
+     elif position == "bottom":
+         bound = [0, h/2, w, h]
+     elif position == "top left":
+         bound = [0, 0, w/2, h/2]
+     elif position == "top right":
+         bound = [w/2, 0, w, h/2]
+     elif position == "bottom left":
+         bound = [0, h/2, w/2, h]
+     elif position == "bottom right":
+         bound = [w/2, h/2, w, h]
+     else:
+         bound = [0, 0, w, h]
+
+     if in_box(box, bound):
+         cropped_image = image.crop(box)
+         cropped_image.save(f"./temp/{i}.jpg")
+         return True
+     else:
+         return False
+
+
+ def clip_for_icon(clip_model, clip_preprocess, images, prompt):
+     image_features = []
+     for image_file in images:
+         image = clip_preprocess(Image.open(image_file)).unsqueeze(0).to(next(clip_model.parameters()).device)
+         image_feature = clip_model.encode_image(image)
+         image_features.append(image_feature)
+     image_features = torch.cat(image_features)
+
+     text = clip.tokenize([prompt]).to(next(clip_model.parameters()).device)
+     text_features = clip_model.encode_text(text)
+
+     image_features /= image_features.norm(dim=-1, keepdim=True)
+     text_features /= text_features.norm(dim=-1, keepdim=True)
+     similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0)
+     _, max_pos = torch.max(similarity, dim=0)
+     pos = max_pos.item()
+
+     return pos
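
A usage sketch for `clip_for_icon`, assuming the OpenAI CLIP package that this module imports and a few crops previously saved by `crop_for_clip`:

import clip
import torch
from MobileAgent.crop import clip_for_icon

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

candidate_crops = ["./temp/1.jpg", "./temp/2.jpg", "./temp/3.jpg"]  # crops written by crop_for_clip
best = clip_for_icon(clip_model, clip_preprocess, candidate_crops, "a blue settings gear icon")
print("Best-matching crop:", candidate_crops[best])
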
MobileAgent/icon_localization.py ADDED
@@ -0,0 +1,59 @@
+ from MobileAgent.crop import calculate_size, calculate_iou
+ from modelscope.pipelines import pipeline
+ from PIL import Image
+ import torch
+
+ def remove_boxes(boxes_filt, size, iou_threshold=0.5):
+     boxes_to_remove = set()
+
+     for i in range(len(boxes_filt)):
+         if calculate_size(boxes_filt[i]) > 0.05*size[0]*size[1]:
+             boxes_to_remove.add(i)
+         for j in range(len(boxes_filt)):
+             if calculate_size(boxes_filt[j]) > 0.05*size[0]*size[1]:
+                 boxes_to_remove.add(j)
+             if i == j:
+                 continue
+             if i in boxes_to_remove or j in boxes_to_remove:
+                 continue
+             iou = calculate_iou(boxes_filt[i], boxes_filt[j])
+             if iou >= iou_threshold:
+                 boxes_to_remove.add(j)
+
+     boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove]
+
+     return boxes_filt
+
+
+ def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5):
+     image = Image.open(input_image_path)
+     size = image.size
+
+     caption = caption.lower()
+     caption = caption.strip()
+     if not caption.endswith('.'):
+         caption = caption + '.'
+
+     inputs = {
+         'IMAGE_PATH': input_image_path,
+         'TEXT_PROMPT': caption,
+         'BOX_TRESHOLD': box_threshold,
+         'TEXT_TRESHOLD': text_threshold
+     }
+
+     result = groundingdino_model(inputs)
+     boxes_filt = result['boxes']
+
+     H, W = size[1], size[0]
+     for i in range(boxes_filt.size(0)):
+         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+         boxes_filt[i][2:] += boxes_filt[i][:2]
+
+     boxes_filt = boxes_filt.cpu().int().tolist()
+     filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
+     coordinates = []
+     for box in filtered_boxes:
+         coordinates.append([box[0], box[1], box[2], box[3]])
+
+     return coordinates
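
A sketch of driving `det` with the ModelScope GroundingDINO pipeline that app.py constructs:

from modelscope import snapshot_download
from modelscope.pipelines import pipeline
from MobileAgent.icon_localization import det

groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)

# Returns [x1, y1, x2, y2] boxes for icon-like regions on the screenshot.
boxes = det("screenshot/screenshot_local.png", "icon", groundingdino_model)
print(boxes)
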
MobileAgent/local_server.py ADDED
@@ -0,0 +1,172 @@
+ import os
+ import base64
+ from io import BytesIO
+ from PIL import Image
+
+ from MobileAgent.api import inference_chat
+ from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt
+ from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image
+
+ from dashscope import MultiModalConversation
+ import dashscope
+ import concurrent
+
+
+ API_url = os.environ.get('url')
+ token = os.environ.get('token')
+
+
+ def base64_to_pil(base64_string):
+     if base64_string.startswith('data:image'):
+         base64_string = base64_string.split(',')[-1]
+     image_data = base64.b64decode(base64_string)
+     image_stream = BytesIO(image_data)
+     pil_image = Image.open(image_stream)
+     return pil_image
+
+
+ def process_image(image, query):
+     dashscope.api_key = os.environ.get('qwen')
+     image = "file://" + image
+     messages = [{
+         'role': 'user',
+         'content': [
+             {
+                 'image': image
+             },
+             {
+                 'text': query
+             },
+         ]
+     }]
+     response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages)
+
+     try:
+         response = response['output']['choices'][0]['message']['content'][0]["text"]
+     except:
+         response = "This is an icon."
+
+     return response
+
+
+ if not os.path.exists("screenshot"):
+     os.mkdir("screenshot")
+ if not os.path.exists("temp"):
+     os.mkdir("temp")
+
+
+ def mobile_agent_infer(json_data):
+     task = json_data["task"]
+     if task == "caption":
+         query = json_data["query"]
+         images = json_data["images"]
+         local_images = []
+         for image in images:
+             image_name = image["image_name"]
+             image_file = image["image_file"]
+             image_file = base64_to_pil(image_file)
+             image_path = "temp/" + image_name
+             image_file.save(image_path, "PNG")
+             local_images.append(image_path)
+
+         icon_map = {}
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = {executor.submit(process_image, image, query): i for i, image in enumerate(local_images)}
+
+             for future in concurrent.futures.as_completed(futures):
+                 i = futures[future]
+                 response = future.result()
+                 icon_map[i + 1] = response
+
+         output = {"icon_map": icon_map}
+         return output
+
+     elif task == "planning":
+         instruction = json_data["instruction"]
+         thought_history = json_data["thought_history"]
+         summary_history = json_data["summary_history"]
+         action_history = json_data["action_history"]
+         completed_requirements = json_data["completed_requirements"]
+         add_info = json_data["add_info"]
+
+         prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history,
+                                              completed_requirements, add_info)
+         chat_planning = init_memory_chat()
+         chat_planning = add_response("user", prompt_planning, chat_planning)
+         output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token)
+
+         output = {"planning": output_planning}
+         return output
+
+     elif task == "decision":
+         screenshot_file = json_data["screenshot_file"]
+         screenshot_file = base64_to_pil(screenshot_file)
+         image_path = "screenshot/screenshot_local.png"
+         screenshot_file.save(image_path, "PNG")
+
+         instruction = json_data["instruction"]
+         perception_infos = json_data["perception_infos"]
+         width = json_data["width"]
+         height = json_data["height"]
+         summary_history = json_data["summary_history"]
+         action_history = json_data["action_history"]
+         summary = json_data["summary"]
+         action = json_data["action"]
+         add_info = json_data["add_info"]
+         error_flag = json_data["error_flag"]
+         completed_requirements = json_data["completed_requirements"]
+         memory = json_data["memory"]
+         memory_switch = json_data["memory_switch"]
+         insight = json_data["insight"]
+
+         prompt_action = get_action_prompt(instruction, perception_infos, width, height, summary_history,
+                                           action_history, summary, action, add_info, error_flag, completed_requirements,
+                                           memory)
+         chat_action = init_action_chat()
+         chat_action = add_response("user", prompt_action, chat_action, image_path)
+         output_action = inference_chat(chat_action, 'gpt-4o', API_url, token)
+         if output_action == "No token":
+             output = {"decision": "No token", "memory": None}
+             return output
+         chat_action = add_response("assistant", output_action, chat_action)
+
+         output_memory = None
+         if memory_switch:
+             prompt_memory = get_memory_prompt(insight)
+             chat_action = add_response("user", prompt_memory, chat_action)
+             output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token)
+
+         output = {"decision": output_action, "memory": output_memory}
+         return output
+
+     elif task == "reflection":
+         screenshot_file = json_data["screenshot_file"]
+         screenshot_file = base64_to_pil(screenshot_file)
+         image_path = "screenshot/screenshot_local.png"
+         screenshot_file.save(image_path, "PNG")
+         last_screenshot_file = json_data["last_screenshot_file"]
+         last_screenshot_file = base64_to_pil(last_screenshot_file)
+         last_image_path = "screenshot/last_screenshot_local.png"
+         last_screenshot_file.save(last_image_path, "PNG")
+
+         instruction = json_data["instruction"]
+         last_perception_infos = json_data["last_perception_infos"]
+         perception_infos = json_data["perception_infos"]
+         width = json_data["width"]
+         height = json_data["height"]
+         summary = json_data["summary"]
+         action = json_data["action"]
+         add_info = json_data["add_info"]
+
+         prompt_reflect = get_reflect_prompt(instruction, last_perception_infos, perception_infos, width, height,
+                                             summary, action, add_info)
+         chat_reflect = init_reflect_chat()
+         chat_reflect = add_response_two_image("user", prompt_reflect, chat_reflect, [last_image_path, image_path])
+         output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token)
+
+         output = {"reflection": output_reflect}
+         return output
+
+     else:
+         output = {"error": "The task must be in \"caption\", \"planning\", \"decision\" and \"reflection\"."}
+         return output
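
An illustrative call to `mobile_agent_infer` for the "planning" task; the field values are made up, and the `url`, `token` and `qwen` environment variables need to be set before this module is imported:

from MobileAgent.local_server import mobile_agent_infer

query_data = {
    "task": "planning",
    "instruction": "Turn on the dark mode",
    "thought_history": ["Open the Settings app"],
    "summary_history": ["Open Settings to find the display options"],
    "action_history": ["Open app (Settings)"],
    "completed_requirements": "",
    "add_info": "",
}

result = mobile_agent_infer(query_data)
print(result["planning"])  # the model's "### Completed contents ###" summary
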
MobileAgent/prompt_no_input.py ADDED
@@ -0,0 +1,174 @@
+ def get_action_prompt(instruction, clickable_infos, width, height, summary_history, action_history, last_summary, last_action, add_info, error_flag, completed_content, memory):
+     prompt = "### Background ###\n"
+     prompt += f"This image is a phone screenshot. Its width is {width} pixels and its height is {height} pixels. The user\'s instruction is: {instruction}.\n\n"
+
+     prompt += "### Screenshot information ###\n"
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. "
+     prompt += "This information consists of two parts: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. "
+     prompt += "The information is as follow:\n"
+
+     for clickable_info in clickable_infos:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+
+     prompt += "Please note that this information is not necessarily accurate. You need to combine the screenshot to understand."
+     prompt += "\n\n"
+
+     if add_info != "":
+         prompt += "### Hint ###\n"
+         prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n"
+         prompt += add_info
+         prompt += "\n\n"
+
+     if len(action_history) > 0:
+         prompt += "### History operations ###\n"
+         prompt += "Before reaching this page, some operations have been completed. You need to refer to the completed operations to decide the next operation. These operations are as follow:\n"
+         for i in range(len(action_history)):
+             prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(" to ")[0].strip() + "; Action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+     if completed_content != "":
+         prompt += "### Progress ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+     if memory != "":
+         prompt += "### Memory ###\n"
+         prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
+         prompt += "Memory:\n" + memory + "\n"
+
+     if error_flag:
+         prompt += "### Last operation ###\n"
+         prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time."
+         prompt += "\n\n"
+
+     prompt += "### Response requirements ###\n"
+     prompt += "Now you need to combine all of the above to perform just one action on the current page. You must choose one of the six actions below:\n"
+     prompt += "Open app (app name): If the current page is desktop, you can use this action to open the app named \"app name\" on the desktop.\n"
+     prompt += "Tap (x, y): Tap the position (x, y) in current page.\n"
+     prompt += "Swipe (x1, y1), (x2, y2): Swipe from position (x1, y1) to position (x2, y2).\n"
+     prompt += "Type (text): Type the \"text\" in the input box.\n"
+     prompt += "Home: Return to home page.\n"
+     prompt += "Stop: If you think all the requirements of user\'s instruction have been completed and no further operation is required, you can choose this action to terminate the operation process."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output consists of the following three parts:\n"
+     prompt += "### Thought ###\nThink about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation.\n"
+     prompt += "### Action ###\nYou can only choose one from the six actions above. Make sure that the coordinates or text in the \"()\".\n"
+     prompt += "### Operation ###\nPlease generate a brief natural language description for the operation in Action based on your Thought."
+
+     return prompt
+
+
+ def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info):
+     prompt = f"These images are two phone screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n"
+
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. "
+     prompt += "The information consists of two parts, consisting of format: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively "
+     prompt += "The keyboard status is whether the keyboard of the current page is activated."
+     prompt += "\n\n"
+
+     prompt += "### Before the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos1:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n"
+
+     prompt += "### After the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos2:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n"
+
+     prompt += "### Current operation ###\n"
+     prompt += f"The user\'s instruction is: {instruction}. You also need to note the following requirements: {add_info}. In the process of completing the requirements of instruction, an operation is performed on the phone. Below are the details of this operation:\n"
+     prompt += "Operation thought: " + summary.split(" to ")[0].strip() + "\n"
+     prompt += "Operation action: " + action
+     prompt += "\n\n"
+
+     prompt += "### Response requirements ###\n"
+     prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n"
+     prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
+     prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n"
+     prompt += "B: The \"Operation action\" results in a wrong page and I need to return to the previous page.\n"
+     prompt += "C: The \"Operation action\" produces no changes."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Thought ###\nYour thought about the question\n"
+     prompt += "### Answer ###\nA or B or C"
+
+     return prompt
+
+
+ def get_memory_prompt(insight):
+     if insight != "":
+         prompt = "### Important content ###\n"
+         prompt += insight
+         prompt += "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     else:
+         prompt = "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to user\'s instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###."
+
+     return prompt
+
+ def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info):
+     prompt = "### Background ###\n"
+     prompt += f"There is an user\'s instruction which is: {instruction}. You are a mobile phone operating assistant and are operating the user\'s mobile phone.\n\n"
+
+     if add_info != "":
+         prompt += "### Hint ###\n"
+         prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n"
+         prompt += add_info
+         prompt += "\n\n"
+
+     if len(thought_history) > 1:
+         prompt += "### History operations ###\n"
+         prompt += "To complete the requirements of user\'s instruction, you have performed a series of operations. These operations are as follow:\n"
+         for i in range(len(summary_history)):
+             operation = summary_history[i].split(" to ")[0].strip()
+             prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+         prompt += "### Progress thinking ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."
+
+     else:
+         prompt += "### Current operation ###\n"
+         prompt += "To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n"
+         prompt += f"Operation thought: {thought_history[-1]}\n"
+         operation = summary_history[-1].split(" to ")[0].strip()
+         prompt += f"Operation action: {operation}\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
+         prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
+         prompt += "(Please use English to output)"
+
+     return prompt
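
A small sketch of `get_action_prompt` with perception entries shaped like those produced in app.py (the coordinate and text values are illustrative):

from MobileAgent.prompt_no_input import get_action_prompt

perception_infos = [
    {"text": "text: Settings", "coordinates": [540, 1210]},
    {"text": "icon: a grey gear icon", "coordinates": [540, 1080]},
]

prompt = get_action_prompt(
    instruction="Turn on the dark mode",
    clickable_infos=perception_infos,
    width=1080, height=2340,
    summary_history=[], action_history=[],
    last_summary="", last_action="",
    add_info="If you want to tap an icon of an app, use the action \"Open app\"",
    error_flag=False, completed_content="", memory="",
)
print(prompt)
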
MobileAgent/text_localization.py ADDED
@@ -0,0 +1,58 @@
+ import cv2
+ import numpy as np
+ from MobileAgent.crop import crop_image, calculate_size
+ from PIL import Image
+
+
+ def order_point(coor):
+     arr = np.array(coor).reshape([4, 2])
+     sum_ = np.sum(arr, 0)
+     centroid = sum_ / arr.shape[0]
+     theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0])
+     sort_points = arr[np.argsort(theta)]
+     sort_points = sort_points.reshape([4, -1])
+     if sort_points[0][0] > centroid[0]:
+         sort_points = np.concatenate([sort_points[3:], sort_points[:3]])
+     sort_points = sort_points.reshape([4, 2]).astype('float32')
+     return sort_points
+
+
+ def longest_common_substring_length(str1, str2):
+     m = len(str1)
+     n = len(str2)
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+     for i in range(1, m + 1):
+         for j in range(1, n + 1):
+             if str1[i - 1] == str2[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1] + 1
+             else:
+                 dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+     return dp[m][n]
+
+
+ def ocr(image_path, ocr_detection, ocr_recognition):
+     text_data = []
+     coordinate = []
+
+     image_full = cv2.imread(image_path)
+     det_result = ocr_detection(image_full)
+     det_result = det_result['polygons']
+     for i in range(det_result.shape[0]):
+         pts = order_point(det_result[i])
+         image_crop = crop_image(image_full, pts)
+
+         try:
+             result = ocr_recognition(image_crop)['text'][0]
+         except:
+             continue
+
+         box = [int(e) for e in list(pts.reshape(-1))]
+         box = [box[0], box[1], box[4], box[5]]
+
+         text_data.append(result)
+         coordinate.append(box)
+
+     else:
+         return text_data, coordinate
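
A sketch of running `ocr` with the same ModelScope pipelines that app.py instantiates:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from MobileAgent.text_localization import ocr

ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')

texts, boxes = ocr("screenshot/screenshot_local.png", ocr_detection, ocr_recognition)
for t, b in zip(texts, boxes):
    print(b, t)  # [x1, y1, x2, y2] box followed by the recognized text
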
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
  title: Mobile Agent
- emoji: 😻
+ emoji: 🦀
- colorFrom: blue
+ colorFrom: indigo
- colorTo: red
+ colorTo: green
  sdk: gradio
- sdk_version: 4.37.1
+ sdk_version: 4.19.1
  app_file: app.py
  pinned: false
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,465 @@
+ import io
+ import os
+ import shutil
+ import base64
+ import gradio as gr
+ from PIL import Image, ImageDraw
+
+ from MobileAgent.text_localization import ocr
+ from MobileAgent.icon_localization import det
+ from MobileAgent.local_server import mobile_agent_infer
+
+ from modelscope import snapshot_download
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+
+
+ chatbot_css = """
+ <style>
+     .chat-container {
+         display: flex;
+         flex-direction: column;
+         overflow-y: auto;
+         max-height: 630px;
+         margin: 10px;
+     }
+     .user-message, .bot-message {
+         margin: 5px;
+         padding: 10px;
+         border-radius: 10px;
+     }
+     .user-message {
+         text-align: right;
+         background-color: #7B68EE;
+         color: white;
+         align-self: flex-end;
+     }
+     .bot-message {
+         text-align: left;
+         background-color: #ADD8E6;
+         color: black;
+         align-self: flex-start;
+     }
+     .user-image {
+         text-align: right;
+         align-self: flex-end;
+         max-width: 150px;
+         max-height: 300px;
+     }
+     .bot-image {
+         text-align: left;
+         align-self: flex-start;
+         max-width: 200px;
+         max-height: 400px;
+     }
+ </style>
+ """
+
+
+ temp_file = "temp"
+ screenshot = "screenshot"
+ cache = "cache"
+ if not os.path.exists(temp_file):
+     os.mkdir(temp_file)
+ if not os.path.exists(screenshot):
+     os.mkdir(screenshot)
+ if not os.path.exists(cache):
+     os.mkdir(cache)
+
+
+ groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
+ groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)
+ ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
+ ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')
+
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def get_all_files_in_folder(folder_path):
+     file_list = []
+     for file_name in os.listdir(folder_path):
+         file_list.append(file_name)
+     return file_list
+
+
+ def crop(image, box, i):
+     image = Image.open(image)
+     x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
+     if x1 >= x2-10 or y1 >= y2-10:
+         return
+     cropped_image = image.crop((x1, y1, x2, y2))
+     cropped_image.save(f"./temp/{i}.png", format="PNG")
+
+
+ def merge_text_blocks(text_list, coordinates_list):
+     merged_text_blocks = []
+     merged_coordinates = []
+
+     sorted_indices = sorted(range(len(coordinates_list)), key=lambda k: (coordinates_list[k][1], coordinates_list[k][0]))
+     sorted_text_list = [text_list[i] for i in sorted_indices]
+     sorted_coordinates_list = [coordinates_list[i] for i in sorted_indices]
+
+     num_blocks = len(sorted_text_list)
+     merge = [False] * num_blocks
+
+     for i in range(num_blocks):
+         if merge[i]:
+             continue
+
+         anchor = i
+
+         group_text = [sorted_text_list[anchor]]
+         group_coordinates = [sorted_coordinates_list[anchor]]
+
+         for j in range(i+1, num_blocks):
+             if merge[j]:
+                 continue
+
+             if abs(sorted_coordinates_list[anchor][0] - sorted_coordinates_list[j][0]) < 10 and \
+             sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] >= -10 and sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] < 30 and \
+             abs(sorted_coordinates_list[anchor][3] - sorted_coordinates_list[anchor][1] - (sorted_coordinates_list[j][3] - sorted_coordinates_list[j][1])) < 10:
+                 group_text.append(sorted_text_list[j])
+                 group_coordinates.append(sorted_coordinates_list[j])
+                 merge[anchor] = True
+                 anchor = j
+                 merge[anchor] = True
+
+         merged_text = "\n".join(group_text)
+         min_x1 = min(group_coordinates, key=lambda x: x[0])[0]
+         min_y1 = min(group_coordinates, key=lambda x: x[1])[1]
+         max_x2 = max(group_coordinates, key=lambda x: x[2])[2]
+         max_y2 = max(group_coordinates, key=lambda x: x[3])[3]
+
+         merged_text_blocks.append(merged_text)
+         merged_coordinates.append([min_x1, min_y1, max_x2, max_y2])
+
+     return merged_text_blocks, merged_coordinates
+
+
+ def get_perception_infos(screenshot_file):
+     width, height = Image.open(screenshot_file).size
+
+     text, coordinates = ocr(screenshot_file, ocr_detection, ocr_recognition)
+     text, coordinates = merge_text_blocks(text, coordinates)
+
+     perception_infos = []
+     for i in range(len(coordinates)):
+         perception_info = {"text": "text: " + text[i], "coordinates": coordinates[i]}
+         perception_infos.append(perception_info)
+
+     coordinates = det(screenshot_file, "icon", groundingdino_model)
+
+     for i in range(len(coordinates)):
+         perception_info = {"text": "icon", "coordinates": coordinates[i]}
+         perception_infos.append(perception_info)
+
+     image_box = []
+     image_id = []
+     for i in range(len(perception_infos)):
+         if perception_infos[i]['text'] == 'icon':
+             image_box.append(perception_infos[i]['coordinates'])
+             image_id.append(i)
+
+     for i in range(len(image_box)):
+         crop(screenshot_file, image_box[i], image_id[i])
+
+     images = get_all_files_in_folder(temp_file)
+     if len(images) > 0:
+         images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
+         image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
+         icon_map = {}
+         prompt = 'This image is an icon from a phone screen. Please briefly describe the shape and color of this icon in one sentence.'
+
+         string_image = []
+         for i in range(len(images)):
+             image_path = os.path.join(temp_file, images[i])
+             string_image.append({"image_name": images[i], "image_file": encode_image(image_path)})
+         query_data = {"task": "caption", "images": string_image, "query": prompt}
+         response_query = mobile_agent_infer(query_data)
+         icon_map = response_query["icon_map"]
+
+         for i, j in zip(image_id, range(1, len(image_id)+1)):
+             if icon_map.get(str(j)):
+                 perception_infos[i]['text'] = "icon: " + icon_map[str(j)]
+
+     for i in range(len(perception_infos)):
+         perception_infos[i]['coordinates'] = [int((perception_infos[i]['coordinates'][0]+perception_infos[i]['coordinates'][2])/2), int((perception_infos[i]['coordinates'][1]+perception_infos[i]['coordinates'][3])/2)]
+
+     return perception_infos, width, height
+
+
+ def image_to_base64(image):
+     buffered = io.BytesIO()
+     image.save(buffered, format="PNG")
+     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+     img_html = f'<img src="data:image/png;base64,{img_str}" />'
+     return img_html
+
+
+ def chatbot(image, instruction, add_info, history, chat_log):
+     if history == {}:
+         thought_history = []
+         summary_history = []
+         action_history = []
+         summary = ""
+         action = ""
+         completed_requirements = ""
+         memory = ""
+         insight = ""
+         error_flag = False
+         user_msg = "<div class='user-message'>{}</div>".format(instruction)
+     else:
+         thought_history = history["thought_history"]
+         summary_history = history["summary_history"]
+         action_history = history["action_history"]
+         summary = history["summary"]
+         action = history["action"]
+         completed_requirements = history["completed_requirements"]
+         memory = history["memory"][0]
+         insight = history["insight"]
+         error_flag = history["error_flag"]
+         user_msg = "<div class='user-message'>{}</div>".format("I have uploaded the screenshot. Please continue operating.")
+
+     images = get_all_files_in_folder(cache)
+     if len(images) > 0 and len(images) <= 100:
+         images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
+         image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
+         cur_image_id = image_id[-1] + 1
+     elif len(images) > 100:
+         images = sorted(images, key=lambda x: int(x.split('/')[-1].split('.')[0]))
+         image_id = [int(image.split('/')[-1].split('.')[0]) for image in images]
+         cur_image_id = image_id[-1] + 1
+         os.remove(os.path.join(cache, str(image_id[0])+".png"))
+     else:
+         cur_image_id = 1
+
+     image.save(os.path.join(cache, str(cur_image_id) + ".png"), format="PNG")
+     screenshot_file = os.path.join(cache, str(cur_image_id) + ".png")
+     perception_infos, width, height = get_perception_infos(screenshot_file)
+     shutil.rmtree(temp_file)
+     os.mkdir(temp_file)
+
+     local_screenshot_file = encode_image(screenshot_file)
+     query_data = {
+         "task": "decision",
+         "screenshot_file": local_screenshot_file,
+         "instruction": instruction,
+         "perception_infos": perception_infos,
+         "width": width,
+         "height": height,
+         "summary_history": summary_history,
+         "action_history": action_history,
+         "summary": summary,
+         "action": action,
+         "add_info": add_info,
+         "error_flag": error_flag,
+         "completed_requirements": completed_requirements,
+         "memory": memory,
+         "memory_switch": True,
+         "insight": insight
+     }
+
+     response_query = mobile_agent_infer(query_data)
+     output_action = response_query["decision"]
+     output_memory = response_query["memory"]
+     if output_action == "No token":
+         bot_response = ["<div class='bot-message'>{}</div>".format("Sorry, the resources can be exhausted today.")]
+         chat_html = "<div class='chat-container'>{}</div>".format("".join(bot_response))
+         return chatbot_css + chat_html, history, chat_log
+
+     thought = output_action.split("### Thought ###")[-1].split("### Action ###")[0].replace("\n", " ").replace(":", "").replace("  ", " ").strip()
+     summary = output_action.split("### Operation ###")[-1].replace("\n", " ").replace("  ", " ").strip()
+     action = output_action.split("### Action ###")[-1].split("### Operation ###")[0].replace("\n", " ").replace("  ", " ").strip()
+
+     output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n"
+     if "None" not in output_memory and output_memory not in memory:
+         memory += output_memory
+
+     if "Open app" in action:
+         bot_response = "Please click the red circle and upload the current screenshot again."
+         app_name = action.split("(")[-1].split(")")[0]
+         text, coordinate = ocr(screenshot_file, ocr_detection, ocr_recognition)
+         for ti in range(len(text)):
+             if app_name == text[ti]:
+                 name_coordinate = [int((coordinate[ti][0] + coordinate[ti][2])/2), int((coordinate[ti][1] + coordinate[ti][3])/2)]
+                 x, y = name_coordinate[0], name_coordinate[1]
+                 radius = 75
+                 draw = ImageDraw.Draw(image)
+                 draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10)
+                 break
+
+     elif "Tap" in action:
+         bot_response = "Please click the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         radius = 75
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=10)
+
+     elif "Swipe" in action:
+         bot_response = "Please slide from red circle to blue circle and upload the current screenshot again."
+         coordinate1 = action.split("Swipe (")[-1].split("), (")[0].split(", ")
+         coordinate2 = action.split("), (")[-1].split(")")[0].split(", ")
+         x1, y1 = int(coordinate1[0]), int(coordinate1[1])
+         x2, y2 = int(coordinate2[0]), int(coordinate2[1])
+         radius = 75
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x1 - radius, y1 - radius, x1 + radius, y1 + radius], outline='red', width=10)
+         draw.ellipse([x2 - radius, y2 - radius, x2 + radius, y2 + radius], outline='blue', width=10)
+
+     elif "Type" in action:
+         if "(text)" not in action:
+             text = action.split("(")[-1].split(")")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         bot_response = f"Please type the \"{text}\" and upload the current screenshot again."
+
+     elif "Back" in action:
+         bot_response = f"Please back to previous page and upload the current screenshot again."
+
+     elif "Home" in action:
+         bot_response = f"Please back to home page and upload the current screenshot again."
+
+     elif "Stop" in action:
+         bot_response = f"Task completed."
+
+     bot_text1 = "<div class='bot-message'>{}</div>".format("### Decision ###")
+     bot_thought = "<div class='bot-message'>{}</div>".format("Thought: " + thought)
+     bot_action = "<div class='bot-message'>{}</div>".format("Action: " + action)
+     bot_operation = "<div class='bot-message'>{}</div>".format("Operation: " + summary)
+     bot_text2 = "<div class='bot-message'>{}</div>".format("### Memory ###")
+     bot_memory = "<div class='bot-message'>{}</div>".format(output_memory)
+     bot_response = "<div class='bot-message'>{}</div>".format(bot_response)
+     if image is not None:
+         bot_img_html = image_to_base64(image)
+         bot_response = "<div class='bot-image'>{}</div>".format(bot_img_html) + bot_response
+
+     chat_log.append(user_msg)
+
+     thought_history.append(thought)
+     summary_history.append(summary)
+     action_history.append(action)
+
+     history["thought_history"] = thought_history
+     history["summary_history"] = summary_history
+     history["action_history"] = action_history
+     history["summary"] = summary
+     history["action"] = action
+     history["memory"] = memory,
+     history["memory_switch"] = True,
+     history["insight"] = insight
+     history["error_flag"] = error_flag
+
+     query_data = {
+         "task": "planning",
+         "instruction": instruction,
+         "thought_history": thought_history,
+         "summary_history": summary_history,
+         "action_history": action_history,
+         "completed_requirements": "",
+         "add_info": add_info
+     }
+
+     response_query = mobile_agent_infer(query_data)
+     output_planning = response_query["planning"]
+     if output_planning == "No token":
+         bot_response = ["<div class='bot-message'>{}</div>".format("Sorry, the resources can be exhausted today.")]
+         chat_html = "<div class='chat-container'>{}</div>".format("".join(bot_response))
+         return chatbot_css + chat_html, history, chat_log
+
+     output_planning = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip()
+     history["completed_requirements"] = output_planning
+
+     bot_text3 = "<div class='bot-message'>{}</div>".format("### Planning ###")
+     output_planning = "<div class='bot-message'>{}</div>".format(output_planning)
+
+     chat_log.append(bot_text3)
+     chat_log.append(output_planning)
+     chat_log.append(bot_text1)
+     chat_log.append(bot_thought)
+     chat_log.append(bot_action)
+     chat_log.append(bot_operation)
+     chat_log.append(bot_text2)
+     chat_log.append(bot_memory)
+     chat_log.append(bot_response)
+
+     chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
+
+     return chatbot_css + chat_html, history, chat_log
+
+
+ def lock_input(instruction):
+     return gr.update(value=instruction, interactive=False), gr.update(value=None)
+
+
+ def reset_demo():
+     return gr.update(value="", interactive=True), gr.update(value="If you want to tap an icon of an app, use the action \"Open app\"", interactive=True), "<div class='chat-container'></div>", {}, []
+
+
+ tos_markdown = ("""<div style="display:flex; gap: 0.25rem;" align="center">
+ <a href='https://github.com/X-PLUG/MobileAgent'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
+ <a href="https://arxiv.org/abs/2406.01014"><img src="https://img.shields.io/badge/Arxiv-2406.01014-red"></a>
+ <a href='https://github.com/X-PLUG/MobileAgent/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/MobileAgent.svg?style=social'></a>
+ </div>
+ If you like our project, please give us a star ✨ on Github for latest update.
+
+ **Terms of use**
+ 1. Input your instruction in \"Instruction\", for example \"Turn on the dark mode\".
+ 2. You can input helpful operation knowledge in \"Knowledge\".
+ 3. Click \"Submit\" to get the operation. You need to operate your mobile device according to the operation and then upload the screenshot after your operation.
+ 4. The 5 cases in \"Examples\" are a complete flow. Click and submit from top to bottom to experience.
+ 5. Due to limited resources, each operation may take a long time, please be patient and wait.
+
+ **使用说明**
+ 1. 在“Instruction”中输入你的指令,例如“打开深色模式”。
+ 2. 你可以在“Knowledge”中输入帮助性的操作知识。
+ 3. 点击“Submit”来获得操作。你需要根据输出来操作手机,并且上传操作后的截图。
+ 4. “Example”中的5个例子是一个任务。从上到下点击它们并且点击“Submit”来体验。
+ 5. 由于资源有限,每次操作的时间会比较长,请耐心等待。""")
+
+ title_markdowm = ("""# Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration""")
+
+ instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
+ knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge", value="If you want to tap an icon of an app, use the action \"Open app\"")
+ with gr.Blocks() as demo:
+     history_state = gr.State(value={})
+     history_output = gr.State(value=[])
+     with gr.Row():
+         gr.Markdown(title_markdowm)
+     with gr.Row():
+         with gr.Column(scale=5):
+             gr.Markdown(tos_markdown)
+             with gr.Row():
+                 image_input = gr.Image(label="Screenshot", type="pil", height=550, width=230)
+                 gr.Examples(examples=[
+                     ["./example/example_1.jpg", "Turn on the dark mode"],
+                     ["./example/example_2.jpg", "Turn on the dark mode"],
+                     ["./example/example_3.jpg", "Turn on the dark mode"],
+                     ["./example/example_4.jpg", "Turn on the dark mode"],
+                     ["./example/example_5.jpg", "Turn on the dark mode"],
+                 ], inputs=[image_input, instruction_input, knowledge_input])
+
+         with gr.Column(scale=6):
+             instruction_input.render()
+             knowledge_input.render()
+             with gr.Row():
+                 start_button = gr.Button("Submit")
+                 clear_button = gr.Button("Clear")
+             output_component = gr.HTML(label="Chat history", value="<div class='chat-container'></div>")
+
+     start_button.click(
+         fn=lambda image, instruction, add_info, history, output: chatbot(image, instruction, add_info, history, output),
+         inputs=[image_input, instruction_input, knowledge_input, history_state, history_output],
+         outputs=[output_component, history_state, history_output]
+     )
+
+     clear_button.click(
+         fn=reset_demo,
+         inputs=[],
+         outputs=[instruction_input, knowledge_input, output_component, history_state, history_output]
+     )
+
+ demo.queue().launch(share=True)
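
A launch sketch: MobileAgent/local_server.py reads the `url`, `token` and `qwen` environment variables at import time, so they must be set (placeholder values shown) before app.py runs:

import os

os.environ.setdefault("url", "https://example.com/v1/chat/completions")  # placeholder GPT-4o-compatible endpoint
os.environ.setdefault("token", "your-api-token")                         # placeholder bearer token
os.environ.setdefault("qwen", "your-dashscope-api-key")                  # placeholder DashScope key for qwen-vl-plus captions

import runpy
runpy.run_path("app.py", run_name="__main__")  # downloads the OCR/GroundingDINO models, then launches the Gradio demo
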
cache/1.png ADDED
cache/10.png ADDED
cache/11.png ADDED
cache/12.png ADDED
cache/13.png ADDED
cache/14.png ADDED
cache/15.png ADDED
cache/16.png ADDED
cache/17.png ADDED
cache/18.png ADDED
cache/19.png ADDED
cache/2.png ADDED
cache/20.png ADDED
cache/21.png ADDED
cache/22.png ADDED
cache/23.png ADDED
cache/24.png ADDED
cache/25.png ADDED
cache/3.png ADDED
cache/4.png ADDED
cache/5.png ADDED
cache/6.png ADDED
cache/7.png ADDED
cache/8.png ADDED
cache/9.png ADDED
example/example_1.jpg ADDED
example/example_2.jpg ADDED
example/example_3.jpg ADDED
example/example_4.jpg ADDED
example/example_5.jpg ADDED