|
import os |
|
import base64 |
|
from io import BytesIO |
|
from PIL import Image |
|
|
|
from MobileAgent.api import inference_chat |
|
from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt |
|
from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image |
|
|
|
from dashscope import MultiModalConversation |
|
import dashscope |
|
import concurrent |
|
|
|
|
|
# Remote chat-completion endpoint and auth token passed to inference_chat;
# read from the environment (either may be None when the variable is unset).
API_url = os.environ.get('url')

token = os.environ.get('token')
|
|
|
|
|
def base64_to_pil(base64_string):
    """Decode a base64-encoded image into a PIL Image.

    Accepts either a raw base64 payload or a full ``data:image/...;base64,``
    data URI, in which case the payload after the last comma is used.

    Args:
        base64_string: Base64 text (optionally a data-URI) of an image.

    Returns:
        A PIL.Image.Image backed by an in-memory byte buffer.
    """
    # Drop the "data:image/...;base64," prefix when a data URI is supplied.
    if base64_string.startswith('data:image'):
        _, _, base64_string = base64_string.rpartition(',')
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(BytesIO(raw_bytes))
|
|
|
|
|
def process_image(image, query):
    """Caption a single local image with the Qwen-VL model via dashscope.

    Args:
        image: Local filesystem path of the image to describe.
        query: Text prompt sent alongside the image.

    Returns:
        The model's text answer, or the fallback string "This is an icon."
        when the response does not have the expected structure.
    """
    dashscope.api_key = os.environ.get('qwen')
    # dashscope expects local files to be passed as file:// URIs.
    image = "file://" + image
    messages = [{
        'role': 'user',
        'content': [
            {
                'image': image
            },
            {
                'text': query
            },
        ]
    }]
    response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages)

    try:
        response = response['output']['choices'][0]['message']['content'][0]["text"]
    # Was a bare `except:`, which also swallows KeyboardInterrupt/SystemExit.
    # Only structural errors in the response lookup should trigger the fallback.
    except (KeyError, IndexError, TypeError):
        response = "This is an icon."

    return response
|
|
|
|
|
# Working directories: "screenshot" for the current/last device screenshots,
# "temp" for decoded icon crops awaiting captioning.
# os.makedirs(..., exist_ok=True) replaces the exists()+mkdir() pair, which
# races (FileExistsError) when several workers start at the same time.
os.makedirs("screenshot", exist_ok=True)
os.makedirs("temp", exist_ok=True)
|
|
|
|
|
def _infer_caption(json_data):
    """Handle the "caption" task: caption every submitted icon crop in parallel.

    Returns {"icon_map": {1-based index in input order: caption text}}.
    """
    query = json_data["query"]
    images = json_data["images"]

    # Materialize the base64 payloads as local PNG files for process_image.
    local_images = []
    for image in images:
        image_name = image["image_name"]
        # NOTE(review): image_name comes straight from the request and is
        # joined into a path unchecked — a name containing "../" could escape
        # temp/. Confirm callers are trusted or sanitize with os.path.basename.
        image_file = base64_to_pil(image["image_file"])
        image_path = "temp/" + image_name
        image_file.save(image_path, "PNG")
        local_images.append(image_path)

    icon_map = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_image, image, query): i
                   for i, image in enumerate(local_images)}
        for future in concurrent.futures.as_completed(futures):
            i = futures[future]
            icon_map[i + 1] = future.result()

    return {"icon_map": icon_map}


def _infer_planning(json_data):
    """Handle the "planning" task: summarize progress toward the instruction."""
    prompt_planning = get_process_prompt(
        json_data["instruction"], json_data["thought_history"],
        json_data["summary_history"], json_data["action_history"],
        json_data["completed_requirements"], json_data["add_info"])
    chat_planning = init_memory_chat()
    chat_planning = add_response("user", prompt_planning, chat_planning)
    output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token)
    return {"planning": output_planning}


def _infer_decision(json_data):
    """Handle the "decision" task: choose the next UI action from the current
    screenshot and perception info, optionally also updating memory."""
    screenshot_file = base64_to_pil(json_data["screenshot_file"])
    image_path = "screenshot/screenshot_local.png"
    screenshot_file.save(image_path, "PNG")

    instruction = json_data["instruction"]
    perception_infos = json_data["perception_infos"]
    width = json_data["width"]
    height = json_data["height"]
    summary_history = json_data["summary_history"]
    action_history = json_data["action_history"]
    summary = json_data["summary"]
    action = json_data["action"]
    add_info = json_data["add_info"]
    error_flag = json_data["error_flag"]
    completed_requirements = json_data["completed_requirements"]
    memory = json_data["memory"]
    memory_switch = json_data["memory_switch"]
    insight = json_data["insight"]

    prompt_action = get_action_prompt(instruction, perception_infos, width, height,
                                      summary_history, action_history, summary,
                                      action, add_info, error_flag,
                                      completed_requirements, memory)
    chat_action = init_action_chat()
    chat_action = add_response("user", prompt_action, chat_action, image_path)
    output_action = inference_chat(chat_action, 'gpt-4o', API_url, token)
    # "No token" is the backend's sentinel for a missing/exhausted API token;
    # bail out before doing any memory work.
    if output_action == "No token":
        return {"decision": "No token", "memory": None}
    chat_action = add_response("assistant", output_action, chat_action)

    output_memory = None
    if memory_switch:
        prompt_memory = get_memory_prompt(insight)
        chat_action = add_response("user", prompt_memory, chat_action)
        output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token)

    return {"decision": output_action, "memory": output_memory}


def _infer_reflection(json_data):
    """Handle the "reflection" task: compare the last and current screenshots
    to judge whether the previous action had the intended effect."""
    screenshot_file = base64_to_pil(json_data["screenshot_file"])
    image_path = "screenshot/screenshot_local.png"
    screenshot_file.save(image_path, "PNG")
    last_screenshot_file = base64_to_pil(json_data["last_screenshot_file"])
    last_image_path = "screenshot/last_screenshot_local.png"
    last_screenshot_file.save(last_image_path, "PNG")

    prompt_reflect = get_reflect_prompt(
        json_data["instruction"], json_data["last_perception_infos"],
        json_data["perception_infos"], json_data["width"], json_data["height"],
        json_data["summary"], json_data["action"], json_data["add_info"])
    chat_reflect = init_reflect_chat()
    chat_reflect = add_response_two_image("user", prompt_reflect, chat_reflect,
                                          [last_image_path, image_path])
    output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token)
    return {"reflection": output_reflect}


def mobile_agent_infer(json_data):
    """Dispatch one Mobile-Agent inference request to its task handler.

    Args:
        json_data: Request dict; json_data["task"] selects the handler and the
            remaining keys are handler-specific (see the _infer_* helpers).

    Returns:
        The handler's result dict, or {"error": ...} for an unknown task.
    """
    task = json_data["task"]
    if task == "caption":
        return _infer_caption(json_data)
    elif task == "planning":
        return _infer_planning(json_data)
    elif task == "decision":
        return _infer_decision(json_data)
    elif task == "reflection":
        return _infer_reflection(json_data)
    else:
        return {"error": "The task must be in \"caption\", \"planning\", \"decision\" and \"reflection\"."}