def _is_origin(coordinates):
    """Return True when *coordinates* is the point (0, 0).

    Coordinates are compared through ``tuple()`` so that a list such as
    ``[0, 0]`` is recognised as well as the tuple ``(0, 0)``; a plain
    ``!= (0, 0)`` test is always true for lists.
    """
    try:
        return tuple(coordinates) == (0, 0)
    except TypeError:
        # Not iterable (e.g. None): cannot be the origin point.
        return False


def _format_clickable_infos(clickable_infos):
    """Render screenshot elements as ``"<coordinates>; <text>\\n"`` lines.

    Entries whose text is empty or ``"icon: None"``, or whose coordinates
    are the origin, are skipped.
    """
    lines = []
    for info in clickable_infos:
        text = info['text']
        if text in ("", "icon: None") or _is_origin(info['coordinates']):
            continue
        lines.append(f"{info['coordinates']}; {text}\n")
    return "".join(lines)


def _operation_of(summary):
    """Return the part of *summary* before the first ``" to "``, stripped."""
    return summary.split(" to ")[0].strip()


def get_action_prompt(instruction, clickable_infos, width, height, summary_history, action_history, last_summary, last_action, add_info, error_flag, completed_content, memory):
    """Build the prompt asking the model to choose the next phone action.

    Args:
        instruction: The user's instruction text.
        clickable_infos: Iterable of dicts with ``'text'`` and
            ``'coordinates'`` keys describing screenshot elements.
        width: Screenshot width in pixels.
        height: Screenshot height in pixels.
        summary_history: Operation summaries of previous steps; must be at
            least as long as ``action_history``.
        action_history: Actions of previous steps; the history section is
            emitted only when this is non-empty.
        last_summary: Summary of the previous operation (used if
            ``error_flag`` is truthy).
        last_action: Action of the previous operation (used if
            ``error_flag`` is truthy).
        add_info: Extra hints; the hint section is omitted when ``""``.
        error_flag: When truthy, a section asking the model to reflect on
            the previous operation is added.
        completed_content: Progress text; omitted when ``""``.
        memory: Recorded memory text; omitted when ``""``.

    Returns:
        The assembled prompt string.

    Raises:
        IndexError: If ``summary_history`` is shorter than
            ``action_history``.
    """
    parts = [
        "### Background ###\n",
        f"This image is a phone screenshot. Its width is {width} pixels and its height is {height} pixels. The user's instruction is: {instruction}.\n\n",
        "### Screenshot information ###\n",
        "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. ",
        "This information consists of two parts: coordinates; content. ",
        "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. ",
        "The information is as follow:\n",
        _format_clickable_infos(clickable_infos),
        "Please note that this information is not necessarily accurate. You need to combine the screenshot to understand.",
        "\n\n",
    ]

    if add_info != "":
        parts += [
            "### Hint ###\n",
            "There are hints to help you complete the user's instructions. The hints are as follow:\n",
            add_info,
            "\n\n",
        ]

    if len(action_history) > 0:
        parts.append("### History operations ###\n")
        parts.append("Before reaching this page, some operations have been completed. You need to refer to the completed operations to decide the next operation. These operations are as follow:\n")
        for i, action in enumerate(action_history):
            parts.append(f"Step-{i+1}: [Operation: " + _operation_of(summary_history[i]) + "; Action: " + action + "]\n")
        parts.append("\n")

    if completed_content != "":
        parts += [
            "### Progress ###\n",
            "After completing the history operations, you have the following thoughts about the progress of user's instruction completion:\n",
            "Completed contents:\n" + completed_content + "\n\n",
        ]

    if memory != "":
        parts += [
            "### Memory ###\n",
            "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n",
            "Memory:\n" + memory + "\n",
        ]

    if error_flag:
        parts += [
            "### Last operation ###\n",
            f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time.",
            "\n\n",
        ]

    parts += [
        "### Response requirements ###\n",
        "Now you need to combine all of the above to perform just one action on the current page. You must choose one of the six actions below:\n",
        "Open app (app name): If the current page is desktop, you can use this action to open the app named \"app name\" on the desktop.\n",
        "Tap (x, y): Tap the position (x, y) in current page.\n",
        "Swipe (x1, y1), (x2, y2): Swipe from position (x1, y1) to position (x2, y2).\n",
        "Type (text): Type the \"text\" in the input box.\n",
        "Home: Return to home page.\n",
        "Stop: If you think all the requirements of user's instruction have been completed and no further operation is required, you can choose this action to terminate the operation process.",
        "\n\n",
        "### Output format ###\n",
        "Your output consists of the following three parts:\n",
        "### Thought ###\nThink about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation.\n",
        "### Action ###\nYou can only choose one from the six actions above. Make sure that the coordinates or text in the \"()\".\n",
        "### Operation ###\nPlease generate a brief natural language description for the operation in Action based on your Thought.",
    ]
    return "".join(parts)


def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info):
    """Build the prompt asking the model to judge the last operation's result.

    Args:
        instruction: The user's instruction text.
        clickable_infos1: Screenshot elements before the operation (dicts
            with ``'text'`` and ``'coordinates'`` keys).
        clickable_infos2: Screenshot elements after the operation.
        width: Screenshot width in pixels.
        height: Screenshot height in pixels.
        summary: Operation summary; only the text before the first
            ``" to "`` is shown as the operation thought.
        action: The action that was executed.
        add_info: Extra requirements; the sentence introducing them is
            omitted when this is ``""``.

    Returns:
        The assembled prompt string.
    """
    # Skip the "requirements" sentence entirely when there is nothing to
    # say, instead of emitting "...requirements: ." with an empty slot.
    requirements = ""
    if add_info != "":
        requirements = f" You also need to note the following requirements: {add_info}."

    # NOTE(review): the intro mentions a "keyboard status", but this function
    # receives no keyboard information to include - confirm with callers.
    parts = [
        f"These images are two phone screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n",
        "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. ",
        "The information consists of two parts, consisting of format: coordinates; content. ",
        "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. ",
        "The keyboard status is whether the keyboard of the current page is activated.",
        "\n\n",
        "### Before the current operation ###\n",
        "Screenshot information:\n",
        _format_clickable_infos(clickable_infos1),
        "\n",
        "### After the current operation ###\n",
        "Screenshot information:\n",
        _format_clickable_infos(clickable_infos2),
        "\n",
        "### Current operation ###\n",
        f"The user's instruction is: {instruction}.{requirements} In the process of completing the requirements of instruction, an operation is performed on the phone. Below are the details of this operation:\n",
        "Operation thought: " + _operation_of(summary) + "\n",
        "Operation action: " + action,
        "\n\n",
        "### Response requirements ###\n",
        "Now you need to output the following content based on the screenshots before and after the current operation:\n",
        "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n",
        "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n",
        "B: The \"Operation action\" results in a wrong page and I need to return to the previous page.\n",
        "C: The \"Operation action\" produces no changes.",
        "\n\n",
        "### Output format ###\n",
        "Your output format is:\n",
        "### Thought ###\nYour thought about the question\n",
        "### Answer ###\nA or B or C",
    ]
    return "".join(parts)


def get_memory_prompt(insight):
    """Build the prompt asking the model to extract content worth remembering.

    Args:
        insight: Description of the important content to look for. When it
            is ``""`` the prompt refers to the user's instruction instead.

    Returns:
        The assembled prompt string.
    """
    if insight != "":
        parts = [
            "### Important content ###\n",
            insight,
            "\n\n",
            "### Response requirements ###\n",
            "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n",
        ]
    else:
        parts = [
            "### Response requirements ###\n",
            "Please think about whether there is any content closely related to user's instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n",
        ]

    parts += [
        "### Output format ###\n",
        "Your output format is:\n",
        "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###.",
    ]
    return "".join(parts)


def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info):
    """Build the prompt asking the model to write or update "Completed contents".

    With more than one entry in ``thought_history`` the prompt lists every
    step from ``summary_history`` / ``action_history`` and asks for an
    update of ``completed_content``; otherwise it describes only the latest
    operation and asks for a first summary.

    Args:
        instruction: The user's instruction text.
        thought_history: Thoughts of previous steps.
        summary_history: Operation summaries of previous steps.
        action_history: Actions of previous steps; must be at least as long
            as ``summary_history`` when the history is listed.
        completed_content: Previously generated progress text.
        add_info: Extra hints; the hint section is omitted when ``""``.

    Returns:
        The assembled prompt string.

    Raises:
        IndexError: If ``thought_history`` or ``summary_history`` is empty
            in the single-operation case, or if ``action_history`` is
            shorter than ``summary_history`` in the history case.
    """
    parts = [
        "### Background ###\n",
        f"There is an user's instruction which is: {instruction}. You are a mobile phone operating assistant and are operating the user's mobile phone.\n\n",
    ]

    if add_info != "":
        parts += [
            "### Hint ###\n",
            "There are hints to help you complete the user's instructions. The hints are as follow:\n",
            add_info,
            "\n\n",
        ]

    if len(thought_history) > 1:
        parts.append("### History operations ###\n")
        parts.append("To complete the requirements of user's instruction, you have performed a series of operations. These operations are as follow:\n")
        for i, summary in enumerate(summary_history):
            operation = _operation_of(summary)
            parts.append(f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n")
        parts += [
            "\n",
            "### Progress thinking ###\n",
            "After completing the history operations, you have the following thoughts about the progress of user's instruction completion:\n",
            "Completed contents:\n" + completed_content + "\n\n",
            "### Response requirements ###\n",
            "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n",
            "### Output format ###\n",
            "Your output format is:\n",
            "### Completed contents ###\nUpdated Completed contents. Don't output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###.",
        ]
    else:
        operation = _operation_of(summary_history[-1])
        parts += [
            "### Current operation ###\n",
            "To complete the requirements of user's instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n",
            f"Operation thought: {thought_history[-1]}\n",
            f"Operation action: {operation}\n\n",
            "### Response requirements ###\n",
            "Now you need to combine all of the above to generate the \"Completed contents\".\n",
            "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user's instruction, and then summarize the contents that have been completed.\n\n",
            "### Output format ###\n",
            "Your output format is:\n",
            "### Completed contents ###\nGenerated Completed contents. Don't output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n",
            "(Please use English to output)",
        ]

    return "".join(parts)