Spaces:
Runtime error
Runtime error
"""Prepare all datasets.""" | |
import os | |
def run_cmd(cmd): | |
print(cmd, flush=True) | |
return os.system(cmd) | |
prefix = "~/datasets/sharegpt_20230521" | |
llama_weights = "~/model_weights/llama-7b/" | |
cmd_list = [ | |
f"python3 -m fastchat.data.clean_sharegpt --in {prefix}_html.json --out {prefix}_clean.json", | |
f"python3 -m fastchat.data.optional_clean --in {prefix}_clean.json --out {prefix}_clean_lang.json --skip-lang ko", | |
f"python3 -m fastchat.data.split_long_conversation --in {prefix}_clean_lang.json --out {prefix}_clean_lang_split.json --model-name {llama_weights}", | |
f"python3 -m fastchat.data.filter_wrong_format --in {prefix}_clean_lang_split.json --out {prefix}_clean_lang_split.json", | |
f"python3 -m fastchat.data.split_train_test --in {prefix}_clean_lang_split.json --ratio 0.99", | |
f"python3 -m fastchat.data.hardcoded_questions", | |
f"python3 -m fastchat.data.merge --in {prefix}_clean_lang_split_train.json hardcoded.json --out {prefix}_clean_lang_split_identity.json", | |
f"python3 -m fastchat.data.extract_gpt4_only --in {prefix}_clean_lang_split_identity.json", | |
f"python3 -m fastchat.data.extract_single_round --in {prefix}_clean_lang_split_identity.json", | |
] | |
for cmd in cmd_list: | |
ret = run_cmd(cmd) | |
if ret != 0: | |
exit(ret) | |