import os
import logging
import traceback

import openai
import gradio as gr
import ujson as json
import commentjson
import openpyxl

import modules.presets as presets
from modules.utils import get_file_hash, count_token
from modules.presets import i18n

def excel_to_jsonl(filepath, preview=False):
    # Open the workbook and read the first (active) sheet
    workbook = openpyxl.load_workbook(filepath)
    sheet = workbook.active

    # Collect every row as a tuple of cell values
    data = []
    for row in sheet.iter_rows(values_only=True):
        data.append(row)

    # The first row holds the column headers; drop completely empty rows
    headers = data[0]
    jsonl = []
    for row in data[1:]:
        row_data = dict(zip(headers, row))
        if any(row_data.values()):
            jsonl.append(row_data)

    # Convert each row into the OpenAI chat fine-tuning format. The expected
    # column headers are 提问 (question), 答案 (answer) and, optionally,
    # 系统 (system prompt).
    formatted_jsonl = []
    for i in jsonl:
        if "提问" in i and "答案" in i:
            if "系统" in i:
                formatted_jsonl.append({
                    "messages": [
                        {"role": "system", "content": i["系统"]},
                        {"role": "user", "content": i["提问"]},
                        {"role": "assistant", "content": i["答案"]}
                    ]
                })
            else:
                formatted_jsonl.append({
                    "messages": [
                        {"role": "user", "content": i["提问"]},
                        {"role": "assistant", "content": i["答案"]}
                    ]
                })
        else:
            logging.warning(f"Skipping a row because it has no 提问 (question) and 答案 (answer) columns: {i}")
    return formatted_jsonl

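# Illustrative example (not part of the module): a spreadsheet row with
# columns 系统/提问/答案 such as
#
#   系统: "You are a poet."   提问: "Write a haiku"   答案: "Autumn moonlight..."
#
# is converted by excel_to_jsonl() into one fine-tuning record:
#
#   {"messages": [
#       {"role": "system", "content": "You are a poet."},
#       {"role": "user", "content": "Write a haiku"},
#       {"role": "assistant", "content": "Autumn moonlight..."}
#   ]}
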
def jsonl_save_to_disk(jsonl, filepath):
    # Name the output after the source file's hash so the same dataset
    # always maps to the same path
    file_hash = get_file_hash(file_paths=[filepath])
    os.makedirs("files", exist_ok=True)
    save_path = f"files/{file_hash}.jsonl"
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("\n".join([json.dumps(i, ensure_ascii=False) for i in jsonl]))
    return save_path

def estimate_cost(ds):
    # Concatenate every message in the dataset, count tokens, then apply
    # OpenAI's gpt-3.5-turbo fine-tuning price of $0.008 per 1K training tokens
    dialogues = []
    for l in ds:
        for m in l["messages"]:
            dialogues.append(m["content"])
    dialogues = "\n".join(dialogues)
    tokens = count_token(dialogues)
    return f"The dataset contains roughly {tokens} tokens; the estimated cost per epoch is about ${tokens / 1000 * 0.008}."

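# Worked example of the estimate above (illustrative numbers): a dataset of
# 250,000 tokens comes out to 250000 / 1000 * 0.008 = $2.00 per epoch, so a
# 3-epoch run would cost about $6.00 in total.
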
def handle_dataset_selection(file_src):
    logging.info(f"Loading dataset {file_src.name}...")
    if file_src.name.endswith(".jsonl"):
        # Already in JSONL format: one JSON object per line
        with open(file_src.name, "r", encoding="utf-8") as f:
            ds = [json.loads(l) for l in f.readlines()]
    else:
        ds = excel_to_jsonl(file_src.name)
    # Preview the first record, enable the upload button, and show the cost estimate
    preview = ds[0]
    return preview, gr.update(interactive=True), estimate_cost(ds)

def upload_to_openai(file_src):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    dspath = file_src.name
    logging.info(f"Uploading dataset {dspath}...")
    if dspath.endswith(".xlsx"):
        # Spreadsheets are converted to JSONL before uploading
        jsonl = excel_to_jsonl(dspath)
        dspath = jsonl_save_to_disk(jsonl, dspath)
    try:
        uploaded = openai.File.create(
            file=open(dspath, "rb"),
            purpose='fine-tune'
        )
        return uploaded.id, "Upload successful."
    except Exception as e:
        traceback.print_exc()
        return "", f"Upload failed: {e}"

def build_event_description(job_id, status, trained_tokens, name=i18n("暂时未知")):
    # Markdown block shown in the UI for a single fine-tuning job
    return f"""
#### Training job {job_id}

Model name: {name}

Status: {status}

Trained tokens so far: {trained_tokens}
"""

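# For reference, the Markdown above renders roughly like this in the UI
# (all values illustrative):
#
#   #### Training job ftjob-abc123
#   Model name: ft:gpt-3.5-turbo:my-org::abc123
#   Status: running
#   Trained tokens so far: 4096
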
def start_training(file_id, suffix, epochs):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    try:
        job = openai.FineTuningJob.create(
            training_file=file_id,
            model="gpt-3.5-turbo",
            suffix=suffix,
            hyperparameters={"n_epochs": epochs}
        )
        return build_event_description(job.id, job.status, job.trained_tokens)
    except Exception as e:
        traceback.print_exc()
        if "is not ready" in str(e):
            return "Training failed because the file is not ready yet. OpenAI needs a moment to process newly uploaded files; try again in a few minutes."
        return f"Training failed: {e}"

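# A minimal sketch of how the helpers above chain together, assuming
# OPENAI_API_KEY is set and `file_src` is the value of the gr.File component
# (variable names here are illustrative, not part of the module):
#
#   file_id, msg = upload_to_openai(file_src)
#   print(start_training(file_id, suffix="my-model", epochs=3))
#   print(get_training_status()[0])   # poll until the job succeeds
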
def get_training_status():
    openai.api_key = os.getenv("OPENAI_API_KEY")
    active_jobs = [
        build_event_description(job["id"], job["status"], job["trained_tokens"], job["fine_tuned_model"])
        for job in openai.FineTuningJob.list(limit=10)["data"]
        if job["status"] != "cancelled"
    ]
    return "\n\n".join(active_jobs), gr.update(interactive=True) if len(active_jobs) > 0 else gr.update(interactive=False)

def handle_dataset_clear():
    return gr.update(value=None), gr.update(interactive=False)

def add_to_models():
    openai.api_key = os.getenv("OPENAI_API_KEY")
    succeeded_jobs = [job for job in openai.FineTuningJob.list()["data"] if job["status"] == "succeeded"]
    extra_models = [job["fine_tuned_model"] for job in succeeded_jobs]
    for i in extra_models:
        if i not in presets.MODELS:
            presets.MODELS.append(i)

    # Persist the fine-tuned models to config.json so they survive restarts
    with open('config.json', 'r', encoding='utf-8') as f:
        data = commentjson.load(f)
    if 'extra_models' in data:
        for i in extra_models:
            if i not in data['extra_models']:
                data['extra_models'].append(i)
    else:
        data['extra_models'] = extra_models
    with open('config.json', 'w', encoding='utf-8') as f:
        commentjson.dump(data, f, indent=4)

    return gr.update(choices=presets.MODELS), f"Successfully added {len(succeeded_jobs)} model(s)."

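# After a successful run, config.json would contain a fragment like this
# (model name illustrative):
#
#   {
#       "extra_models": [
#           "ft:gpt-3.5-turbo:my-org::abc123"
#       ]
#   }
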
def cancel_all_jobs():
    openai.api_key = os.getenv("OPENAI_API_KEY")
    # Cancel every job that is still pending or running
    jobs = [job for job in openai.FineTuningJob.list()["data"] if job["status"] not in ["cancelled", "succeeded"]]
    for job in jobs:
        openai.FineTuningJob.cancel(job["id"])
    return f"Successfully cancelled {len(jobs)} training job(s)."