# Mobile-Agent/MobileAgent/local_server.py (Mobile-Agent-v2)
import os
import base64
import concurrent.futures
from io import BytesIO

from PIL import Image

import dashscope
from dashscope import MultiModalConversation

from MobileAgent.api import inference_chat
from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt
from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image

# GPT endpoint and token come from the environment; the Qwen-VL key is read
# later in process_image from the 'qwen' environment variable.
API_url = os.environ.get('url')
token = os.environ.get('token')


def base64_to_pil(base64_string):
    # Strip an optional "data:image/...;base64," prefix, then decode the
    # base64 payload into a PIL image.
    if base64_string.startswith('data:image'):
        base64_string = base64_string.split(',')[-1]
    image_data = base64.b64decode(base64_string)
    image_stream = BytesIO(image_data)
    pil_image = Image.open(image_stream)
    return pil_image


def process_image(image, query):
    # Caption a single local image file with Qwen-VL via DashScope.
    dashscope.api_key = os.environ.get('qwen')
    image = "file://" + image
    messages = [{
        'role': 'user',
        'content': [
            {
                'image': image
            },
            {
                'text': query
            },
        ]
    }]
    response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages)
    try:
        response = response['output']['choices'][0]['message']['content'][0]["text"]
    except Exception:
        # Fall back to a generic caption if the API response cannot be parsed.
        response = "This is an icon."
    return response
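
# Example (hypothetical, not part of the original pipeline): calling
# process_image("temp/icon_1.png", "Describe this icon.") returns a short
# qwen-vl-plus caption, or the fallback string "This is an icon." when the
# DashScope response is malformed.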

# Working directories: "screenshot" holds the current and previous
# screenshots, "temp" holds image crops awaiting captioning.
if not os.path.exists("screenshot"):
    os.mkdir("screenshot")
if not os.path.exists("temp"):
    os.mkdir("temp")


def mobile_agent_infer(json_data):
    task = json_data["task"]

    # Caption task: decode each uploaded image crop, save it locally, and
    # caption all crops in parallel with Qwen-VL.
    if task == "caption":
        query = json_data["query"]
        images = json_data["images"]
        local_images = []
        for image in images:
            image_name = image["image_name"]
            image_file = image["image_file"]
            image_file = base64_to_pil(image_file)
            image_path = "temp/" + image_name
            image_file.save(image_path, "PNG")
            local_images.append(image_path)

        icon_map = {}
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {executor.submit(process_image, image, query): i for i, image in enumerate(local_images)}
            for future in concurrent.futures.as_completed(futures):
                i = futures[future]
                response = future.result()
                # Keys are 1-based image indices.
                icon_map[i + 1] = response

        output = {"icon_map": icon_map}
        return output

    # Planning task: build the process prompt from the histories and query
    # GPT (text only) for the updated task progress.
    elif task == "planning":
        instruction = json_data["instruction"]
        thought_history = json_data["thought_history"]
        summary_history = json_data["summary_history"]
        action_history = json_data["action_history"]
        completed_requirements = json_data["completed_requirements"]
        add_info = json_data["add_info"]

        prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history,
                                             completed_requirements, add_info)
        chat_planning = init_memory_chat()
        chat_planning = add_response("user", prompt_planning, chat_planning)
        output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token)

        output = {"planning": output_planning}
        return output

    # Decision task: given the current screenshot and perception infos,
    # ask GPT for the next action; optionally query for memory afterwards.
    elif task == "decision":
        screenshot_file = json_data["screenshot_file"]
        screenshot_file = base64_to_pil(screenshot_file)
        image_path = "screenshot/screenshot_local.png"
        screenshot_file.save(image_path, "PNG")

        instruction = json_data["instruction"]
        perception_infos = json_data["perception_infos"]
        width = json_data["width"]
        height = json_data["height"]
        summary_history = json_data["summary_history"]
        action_history = json_data["action_history"]
        summary = json_data["summary"]
        action = json_data["action"]
        add_info = json_data["add_info"]
        error_flag = json_data["error_flag"]
        completed_requirements = json_data["completed_requirements"]
        memory = json_data["memory"]
        memory_switch = json_data["memory_switch"]
        insight = json_data["insight"]

        prompt_action = get_action_prompt(instruction, perception_infos, width, height, summary_history,
                                          action_history, summary, action, add_info, error_flag,
                                          completed_requirements, memory)
        chat_action = init_action_chat()
        chat_action = add_response("user", prompt_action, chat_action, image_path)
        output_action = inference_chat(chat_action, 'gpt-4o', API_url, token)
        if output_action == "No token":
            output = {"decision": "No token", "memory": None}
            return output
        chat_action = add_response("assistant", output_action, chat_action)

        # When the memory switch is on, ask a follow-up question to decide
        # what from the current screen should be kept in memory.
        output_memory = None
        if memory_switch:
            prompt_memory = get_memory_prompt(insight)
            chat_action = add_response("user", prompt_memory, chat_action)
            output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token)

        output = {"decision": output_action, "memory": output_memory}
        return output

    # Reflection task: compare the screenshots before and after the last
    # action and ask GPT whether the action had the intended effect.
    elif task == "reflection":
        screenshot_file = json_data["screenshot_file"]
        screenshot_file = base64_to_pil(screenshot_file)
        image_path = "screenshot/screenshot_local.png"
        screenshot_file.save(image_path, "PNG")

        last_screenshot_file = json_data["last_screenshot_file"]
        last_screenshot_file = base64_to_pil(last_screenshot_file)
        last_image_path = "screenshot/last_screenshot_local.png"
        last_screenshot_file.save(last_image_path, "PNG")

        instruction = json_data["instruction"]
        last_perception_infos = json_data["last_perception_infos"]
        perception_infos = json_data["perception_infos"]
        width = json_data["width"]
        height = json_data["height"]
        summary = json_data["summary"]
        action = json_data["action"]
        add_info = json_data["add_info"]

        prompt_reflect = get_reflect_prompt(instruction, last_perception_infos, perception_infos, width, height,
                                            summary, action, add_info)
        chat_reflect = init_reflect_chat()
        chat_reflect = add_response_two_image("user", prompt_reflect, chat_reflect, [last_image_path, image_path])
        output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token)

        output = {"reflection": output_reflect}
        return output

    else:
        output = {"error": "The task must be one of \"caption\", \"planning\", \"decision\", or \"reflection\"."}
        return output
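

# --- Hypothetical usage sketch (not part of the original file) ---
# A minimal, self-contained example of how mobile_agent_infer could be called
# directly with a "caption" request. The file name "example_icon.png" and the
# query text are placeholder assumptions, not values from the Mobile-Agent-v2
# pipeline, and running it requires the 'qwen' environment variable for
# DashScope.
if __name__ == "__main__":
    with open("example_icon.png", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    request = {
        "task": "caption",
        "query": "Describe this icon in one sentence.",
        "images": [{"image_name": "example_icon.png", "image_file": encoded}],
    }
    print(mobile_agent_infer(request))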