import os |
import json |
# Restrict this process to GPU index 1. This is set here, ahead of the
# `import torch` further down the file, so the restriction is already in
# place when the CUDA runtime is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
def read_json(file_path):
    """Load and return the JSON document stored at *file_path*.

    The file is opened as UTF-8 text and closed again before returning.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def write_json(file_path, data):
    """Serialise *data* as pretty-printed JSON into *file_path*.

    The file is written as UTF-8 with a 4-space indent. Non-ASCII characters
    are written as-is (``ensure_ascii=False``) rather than as ``\\uXXXX``
    escapes. An existing file at *file_path* is overwritten.

    Args:
        file_path: Destination path of the JSON file.
        data: Any JSON-serialisable value.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If *data* contains a value ``json.dump`` cannot serialise.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
import os |
from openai import OpenAI |
import pprint |
import json |
from llamaapi import LlamaAPI |
# API credentials are taken from the environment so that no secret is stored
# in the source file. Export LLAMA_API_KEY before running; if it is missing
# the lookup below fails immediately with a KeyError instead of producing
# authentication errors later on.
# NOTE(review): any key that was ever committed in this file should be
# treated as leaked and rotated.
llama = LlamaAPI(os.environ["LLAMA_API_KEY"])

# OPENAI_API_KEY is likewise expected to be provided by the environment
# rather than being assigned here.
if not os.environ.get("OPENAI_API_KEY"):
    print("Warning: OPENAI_API_KEY is not set in the environment.")
from chat import MiniCPMVChat, img2base64 |
import torch |
import json |
from PIL import Image |
# Seed torch's RNG before the model is loaded and queried.
torch.manual_seed(0)

# Vision-language chat wrapper, loaded from a local checkpoint directory.
chat_model = MiniCPMVChat('/code/ICLR_2024/Model/MiniCPM-Llama3-V-2_5')

# One-off query against a single sample screenshot.
# NOTE(review): `answer` from this call is neither printed nor stored in the
# code visible here, so this looks like a smoke test / warm-up. It is kept
# because the call may advance the RNG state seeded above — confirm before
# removing it.
image_path = '/code/ICLR_2024/SeeClick/output_image_27.png'
qs = """
List all the application name and location in the image that can be interacted with, the result shoudl be like a list
"""
im_64 = img2base64(image_path)
msgs = [{"role": "user", "content": qs}]
# The chat wrapper is given the base64 image plus the JSON-encoded messages.
inputs = {"image": im_64, "question": json.dumps(msgs)}
answer = chat_model.chat(inputs)
# Load the dataset that is annotated in place further down. Each record is
# indexed below by its 'image' entry.
data = read_json("/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json")

# Image path -> position of the record in `data`. If a path occurs more than
# once, the last occurrence wins.
retrival_dict = {record['image']: index for index, record in enumerate(data)}

path = '/code/ICLR_2024/Auto-GUI/dataset/'

# Bare image ids: third '/'-separated component of the path, cut at the
# first '.'.
# NOTE(review): presumably ids look like '<episode>_<step>' — confirm.
image_id = [x['image'].split('/')[2].split('.')[0] for x in data]

# Group the second '_'-separated part of every id under the first part.
# Plain dicts preserve insertion order, so keys appear in first-seen order.
all_pair_id = {}
for name in image_id:
    parts = name.split('_')
    all_pair_id.setdefault(parts[0], []).append(parts[1])

# Unique keys in a deterministic (first-seen) order. De-duplicating through
# set() yields an order that changes between interpreter runs because str
# hashes are randomised, which makes a fixed resume offset into this list
# select a different subset on every run.
all_pair_key = list(all_pair_id)

path2 = 'blip/single_texts_splits/'
import ast
import re
import time

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: no progress bar."""
        return iterable

# Prompt sent to the vision-language model together with each screenshot.
ICON_LIST_PROMPT = """
List all the application name and location in the image that can be interacted with, the result shoudl be like a list
"""

# Instruction appended to the raw model answer when asking the text LLM to
# reformat it as a Python list assigned to the name `elements`.
REFINE_PROMPT = """ ##### refine it to a list, list name must be elements , just like:
elements = [
"Newegg",
"Newegg CEO",
"Newegg customer service",
"Newegg founder",
"Newegg promo code",
"Newegg return policy",
"Newegg revenue",
"Newegg military discounts"]
Answer the python list only!
##### """

# The annotated dataset is written back to this file.
OUTPUT_JSON = '/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json'


def parse_element_list(text):
    """Extract the list literal from an LLM reply without executing the reply.

    The reply is expected to contain ``elements = [...]``, possibly wrapped
    in a Markdown code fence or surrounded by prose. If no such assignment is
    present, the first ``[`` in the text is used as the start of the list.
    Only Python literals are accepted (``ast.literal_eval``), so nothing in
    the reply is ever run as code.

    Args:
        text: Reply text returned by the LLM.

    Returns:
        The parsed list.

    Raises:
        ValueError: If *text* is not a string or holds no valid list literal.
    """
    if not isinstance(text, str):
        raise ValueError(f"expected a text reply, got {type(text).__name__}")

    assignment = re.search(r"\belements\s*=\s*\[", text)
    start = assignment.end() - 1 if assignment else text.find("[")
    if start < 0:
        raise ValueError("no list literal found in reply")

    # Try each ']' as the closing bracket until the slice parses. This copes
    # with nested lists and with ']' characters inside quoted strings.
    end = text.find("]", start)
    while end != -1:
        try:
            parsed = ast.literal_eval(text[start:end + 1])
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, list):
            return parsed
        end = text.find("]", end + 1)
    raise ValueError("reply does not contain a valid list literal")


def annotate_icon_lists(start=770):
    """Annotate dataset records with the interactive elements of each image.

    For every key in ``all_pair_key[start:]`` and every image grouped under
    that key, the function:

    1. asks the vision-language model for the interactive elements and
       stores the raw answer under ``'icon_list_raw'``;
    2. asks the Llama API to reformat that answer as a Python list;
    3. stores the parsed list under ``'icon_list'``.

    An image whose API call or parsing fails is reported and skipped; its
    record keeps ``'icon_list_raw'`` but gets no ``'icon_list'``.

    Args:
        start: Index into ``all_pair_key`` to begin from, e.g. to resume an
            interrupted run.
    """
    for key in tqdm(all_pair_key[start:]):
        for suffix in all_pair_id[key]:
            retival_path = path2 + key + '_' + suffix + '.png'
            ids = retrival_dict.get(retival_path)
            if ids is None:
                # The rebuilt path can differ from the stored one, e.g. when
                # an image id contains more than one '_'.
                print(f"No dataset record for {retival_path}; skipping")
                continue
            record = data[ids]
            image_path = path + record['image']

            # Step 1: raw element description from the vision-language model.
            im_64 = img2base64(image_path)
            msgs = [{"role": "user", "content": ICON_LIST_PROMPT}]
            inputs = {"image": im_64, "question": json.dumps(msgs)}
            answer = chat_model.chat(inputs)
            record['icon_list_raw'] = answer
            pprint.pprint(answer)

            # Step 2: have the text LLM reformat the answer. A short pause is
            # taken before each request.
            time.sleep(2)
            api_request_json = {
                "model": "llama3-70b",
                "messages": [
                    {"role": "system", "content": "You are a assistant that will handle the corresponding text formatting for me."},
                    # assumes `answer` is a str — TODO confirm
                    {"role": "user", "content": answer + REFINE_PROMPT},
                ],
                "max_tokens": 1024
            }
            try:
                response = llama.run(api_request_json)
                new_answer = response.json()['choices'][0]['message']['content']
            except Exception as e:
                # Boundary with a remote service: report, back off, move on.
                print(f"Error in LLAMA API Generation : {e}")
                time.sleep(30)
                continue
            print('======================================================')
            pprint.pprint(new_answer)
            print('======================================================')

            # Step 3: parse the reply as data. It is never exec()'d, and a
            # list from an earlier image can no longer be stored by mistake
            # when the reply fails to define one.
            try:
                record['icon_list'] = parse_element_list(new_answer)
            except ValueError as e:
                print(f"Error in setting data[ids]['icon_list']: {e}")
                continue

        # Checkpoint after each key so an interrupted run loses at most one
        # key's worth of work.
        # NOTE(review): where the original write sat relative to the loops
        # could not be recovered; the final file content is the same.
        write_json(OUTPUT_JSON, data)

    # Written once more so the file is produced even when no key was
    # processed.
    write_json(OUTPUT_JSON, data)


if __name__ == "__main__":
    annotate_icon_lists()