|
import cv2 |
|
import numpy as np |
|
import IPython |
|
import os |
|
|
|
import openai |
|
import pandas as pd |
|
import json |
|
import subprocess |
|
from gensim.utils import set_gpt_model, clear_messages, format_finetune_prompt |
|
|
|
|
|
|
|
def format_completion(task_name, descriptions, code):
    """Build the completion half of one fine-tuning example.

    The result is two sections joined together: a header containing the
    task name and its description, followed by the task's source code
    wrapped in a ```python fence. Each section ends with the "###" marker.

    Args:
        task_name: Name of the task; interpolated into the header.
        descriptions: Task description; interpolated into the header
            (any object is accepted, it is formatted like in an f-string).
        code: Source code of the task; must be a ``str`` because it is
            concatenated directly.

    Returns:
        The assembled completion string.
    """
    # Header section: "<name>: <description>" followed by the section marker.
    description_section = f" \n {task_name}: {descriptions}```\n\n###"
    # Code section: the implementation inside a python code fence.
    code_section = "\n```python\n" + code + "\n```\n\n###"
    return description_section + code_section
|
|
|
|
|
|
|
# Directory (relative to the working directory) holding the JSON memory files.
data_path = 'prompts/data'


def _read_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def load_offline_memory():
    """Get the current task descriptions, assets, and code.

    Reads the ``base_*.json`` files under ``data_path`` and merges the
    generated entries into them:

    * ``generated_tasks.json`` is merged into the base tasks with
      ``update`` (generated entries win on key collisions).
    * entries of ``generated_task_codes.json`` that are not already in the
      base task-code list are appended, preserving order.

    ``generated_assets.json`` is NOT read; the assets returned are the base
    assets only.

    Returns:
        A tuple ``(base_tasks, base_assets, base_task_codes)``.

    Raises:
        FileNotFoundError: if any of the five files that are read is missing.
        json.JSONDecodeError: if any of them is not valid JSON.
    """
    base_tasks = _read_json(os.path.join(data_path, "base_tasks.json"))
    base_assets = _read_json(os.path.join(data_path, "base_assets.json"))
    base_task_codes = _read_json(os.path.join(data_path, "base_task_codes.json"))

    # Generated task descriptions override / extend the base ones.
    base_tasks.update(_read_json(os.path.join(data_path, "generated_tasks.json")))

    # Append generated code files that are not already listed, keeping order.
    generated_task_codes = _read_json(
        os.path.join(data_path, "generated_task_codes.json")
    )
    for task in generated_task_codes:
        if task not in base_task_codes:
            base_task_codes.append(task)

    return base_tasks, base_assets, base_task_codes
|
|
|
|
|
# ---------------------------------------------------------------------------
# Script body: build prompts/finetune_data.csv with one (prompt, completion)
# row per known task, print a rough size/cost estimate, then hand the CSV to
# the `openai tools fine_tunes.prepare_data` command.
# ---------------------------------------------------------------------------
code_buffer = {}
base_tasks, base_assets, base_task_codes = load_offline_memory()
TOTAL_DATASET_TOKENS = 0

# Directories searched, in this order, for a task's source file.
TASK_CODE_DIRS = ("cliport/tasks/", "cliport/generated_tasks/")

added_tasks = []
rows = []
for task_file in base_task_codes:
    # Drop the last three characters of the file name (presumably the ".py"
    # suffix -- TODO confirm) and switch underscores to dashes to obtain the
    # key used in base_tasks.
    task_name = task_file[:-3].replace("_", "-")
    if task_name in added_tasks:
        continue

    if task_name not in base_tasks:
        print(f"{task_name} missing")
        continue

    # Locate the source file. Previously a task whose file existed in neither
    # directory either raised NameError (first iteration) or silently reused
    # the code of the previous task; such tasks are now reported and skipped.
    task_path = next(
        (
            directory + task_file
            for directory in TASK_CODE_DIRS
            if os.path.exists(directory + task_file)
        ),
        None,
    )
    if task_path is None:
        print(f"{task_name} code file not found, skipping")
        continue

    added_tasks.append(task_name)
    task_description = base_tasks[task_name]

    with open(task_path, encoding="utf-8") as code_file:
        task_code = code_file.read()

    prompt = format_finetune_prompt(task_name)
    completion = format_completion(task_name, task_description, task_code)

    # Rough token estimate: character count divided by four.
    TOTAL_DATASET_TOKENS += len(prompt) / 4
    TOTAL_DATASET_TOKENS += len(completion) / 4
    rows.append({'prompt': prompt, 'completion': completion})

# Build the frame once instead of concatenating inside the loop.
df = pd.DataFrame(rows, columns=['prompt', 'completion'])

df.to_csv("prompts/finetune_data.csv", index=False)
print("======================================")
print("estimate number of tokens:", TOTAL_DATASET_TOKENS)
print("estimate price for davinci:", TOTAL_DATASET_TOKENS / 1000 * 0.03)
print("total number of instructions:", len(df))
print("======================================")

# Let the OpenAI CLI validate/convert the CSV. The command is passed as an
# argument list, so no shell is involved.
prepare_result = subprocess.run(
    ['openai', 'tools', 'fine_tunes.prepare_data', '--file', 'prompts/finetune_data.csv']
)
if prepare_result.returncode != 0:
    print("warning: data preparation exited with status", prepare_result.returncode)

print("now you can run \n openai api fine_tunes.create --training_file prompts/finetune_data_prepared.jsonl --model davinci --suffix 'GenSim'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|