|
import cv2 |
|
import numpy as np |
|
import IPython |
|
import os |
|
|
|
import openai |
|
import pandas as pd |
|
import json |
|
import subprocess |
|
from gensim.utils import set_gpt_model, clear_messages, format_finetune_prompt, format_finetune_prompt_codeonly |
|
|
|
|
|
def format_completion_codeonly(task_name, descriptions, code):
    """Return the code-only completion text: a fenced python block + STOP marker.

    ``task_name`` and ``descriptions`` are accepted only for interface parity
    with :func:`format_completion`; they are not used here.
    """
    return " \n```python\n{}\n```\n\nSTOP".format(code)
|
|
|
def format_completion(task_name, descriptions, code):
    """Return the full completion text: task header, then fenced code + STOP."""
    header = f" \n {task_name}: {descriptions}```\n\n###"
    body = f"\n```python\n{code}\n```\n\nSTOP"
    return header + body
|
|
|
|
|
|
|
data_path = 'prompts/data'

def load_offline_memory():
    """Load task descriptions, assets, and task-code filenames from disk.

    Base entries are merged with the generated ones so callers see a single
    combined view.

    Returns:
        tuple: ``(tasks, assets, task_codes)`` where ``tasks`` and ``assets``
        are dicts and ``task_codes`` is a list of python filenames.
    """
    base_task_path = os.path.join(data_path, "base_tasks.json")
    base_asset_path = os.path.join(data_path, "base_assets.json")
    base_task_code_path = os.path.join(data_path, "base_task_codes.json")

    # Context managers close the file handles promptly (the previous version
    # leaked handles via bare open() inside json.load()).
    with open(base_task_path) as f:
        base_tasks = json.load(f)
    with open(base_asset_path) as f:
        base_assets = json.load(f)
    with open(base_task_code_path) as f:
        base_task_codes = json.load(f)

    generated_task_path = os.path.join(data_path, "generated_tasks.json")
    generated_asset_path = os.path.join(data_path, "generated_assets.json")
    generated_task_code_path = os.path.join(data_path, "generated_task_codes.json")

    with open(generated_task_path) as f:
        base_tasks.update(json.load(f))

    # BUG FIX: generated_asset_path was built but never loaded, so generated
    # assets were silently dropped. Guarded with exists() so setups without
    # the file keep working exactly as before.
    if os.path.exists(generated_asset_path):
        with open(generated_asset_path) as f:
            base_assets.update(json.load(f))

    with open(generated_task_code_path) as f:
        generated_task_codes = json.load(f)
    # Append only codes not already present to avoid duplicates.
    for task in generated_task_codes:
        if task not in base_task_codes:
            base_task_codes.append(task)

    return base_tasks, base_assets, base_task_codes
|
|
|
|
|
# Module-level state used by the dataset-building loop below.
code_buffer = {}

# Combined (base + generated) tasks, assets, and task-code filenames.
base_tasks, base_assets, base_task_codes = load_offline_memory()

# Running character-based token estimate (~4 chars per token).
TOTAL_DATASET_TOKENS = 0

# Task names already emitted, used to skip duplicates.
added_tasks = []

# NOTE(review): df is never populated below — presumably a leftover from an
# earlier CSV-based pipeline; verify before removing.
df = pd.DataFrame()

# Output JSONL file with one chat-format fine-tuning example per line.
file_name = 'prompts/finetune_data_new.jsonl'

file = open(file_name, 'w')
|
|
|
|
|
# Build one chat-format fine-tuning example per task code file.
for task_file in base_task_codes:

    # "stack_blocks.py" -> "stack-blocks"
    task_name = task_file[:-3].replace("_", "-")

    # Skip tasks already emitted.
    if task_name in added_tasks:
        continue

    # Skip code files that have no matching task description.
    if task_name not in base_tasks:
        print(f"{task_name} missing")
        continue

    added_tasks.append(task_name)
    task_description = base_tasks[task_name]

    # Locate the task's source code: base tasks first, then generated tasks.
    if os.path.exists("cliport/tasks/" + task_file):
        task_code = open("cliport/tasks/" + task_file).read()
    elif os.path.exists("cliport/generated_tasks/" + task_file):
        task_code = open("cliport/generated_tasks/" + task_file).read()
    else:
        # BUG FIX: previously this case fell through and silently reused the
        # previous iteration's task_code (NameError on the first iteration).
        print(f"code for {task_name} not found, skipping")
        continue

    prompt = format_finetune_prompt_codeonly(task_name)
    completion = format_completion_codeonly(task_name, task_description, task_code)

    # Rough token estimate: ~4 characters per token.
    TOTAL_DATASET_TOKENS += len(prompt) / 4
    TOTAL_DATASET_TOKENS += len(completion) / 4

    # OpenAI chat fine-tuning format: system / user / assistant messages.
    data = ({"messages": [{"role": "system", "content": "You are an AI in robot simulation code and task design."},
                          {"role": "user", "content": prompt},
                          {"role": "assistant", "content": completion}]})

    file.write(json.dumps(data)+"\n")
|
|
|
|
|
# Flush and close the dataset file before telling the user to inspect it
# (it was previously never closed, risking truncated output on interpreter
# teardown).
file.close()

print("======================================")
print("estimate number of tokens:", TOTAL_DATASET_TOKENS)
print("estimate price for davinci:", TOTAL_DATASET_TOKENS / 1000 * 0.03)
# BUG FIX: len(df) was always 0 because df is never populated; added_tasks
# holds one entry per example actually written.
print("total number of instructions:", len(added_tasks))
print("======================================")

print("now you can run \n python misc/job_create.py")
print("check file!:", file_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|