File size: 1,279 Bytes
e72aedf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""Prepare all datasets."""

import os


def run_cmd(cmd):
    """Echo a shell command to stdout, run it, and return its status.

    Args:
        cmd: Command line, as a single string, handed to ``os.system``.

    Returns:
        The value returned by ``os.system(cmd)``, unmodified. It is 0 when
        the command succeeds; the encoding of a non-zero value is
        platform-dependent (see the ``os.system`` documentation).
    """
    # Flush right away so the echoed command shows up before any output the
    # child process writes to the same stream.
    print(cmd, flush=True)
    status = os.system(cmd)
    return status


# Common path prefix of the dataset files: every stage below reads and/or
# writes "<prefix>_<suffix>.json". The leading "~" is not expanded here; the
# strings are passed verbatim to the shell that runs each command.
prefix = "~/datasets/sharegpt_20230521"
# Value passed as --model-name to the split_long_conversation stage.
llama_weights = "~/model_weights/llama-7b/"

# Pipeline stages, executed strictly in list order. Most stages consume the
# file written by the stage before them, so the order matters.
cmd_list = [
    f"python3 -m fastchat.data.clean_sharegpt --in {prefix}_html.json --out {prefix}_clean.json",
    f"python3 -m fastchat.data.optional_clean --in {prefix}_clean.json --out {prefix}_clean_lang.json --skip-lang ko",
    f"python3 -m fastchat.data.split_long_conversation --in {prefix}_clean_lang.json --out {prefix}_clean_lang_split.json --model-name {llama_weights}",
    # This stage is given the same path for --in and --out.
    f"python3 -m fastchat.data.filter_wrong_format --in {prefix}_clean_lang_split.json --out {prefix}_clean_lang_split.json",
    f"python3 -m fastchat.data.split_train_test --in {prefix}_clean_lang_split.json --ratio 0.99",
    # NOTE(review): presumably writes the "hardcoded.json" that the merge
    # stage below reads from the current working directory -- confirm.
    "python3 -m fastchat.data.hardcoded_questions",
    f"python3 -m fastchat.data.merge --in {prefix}_clean_lang_split_train.json hardcoded.json --out {prefix}_clean_lang_split_identity.json",
    f"python3 -m fastchat.data.extract_gpt4_only --in {prefix}_clean_lang_split_identity.json",
    f"python3 -m fastchat.data.extract_single_round --in {prefix}_clean_lang_split_identity.json",
]

for cmd in cmd_list:
    ret = run_cmd(cmd)
    if ret != 0:
        # Stop at the first failing stage. The raw status must not be used as
        # this process's exit code: on POSIX, os.system() returns a wait
        # status with the child's exit code in the high byte (exit code 1 ->
        # 256), and the OS keeps only the low 8 bits of an exit status, so
        # exiting with 256 would report success (0). Raising SystemExit with
        # a message prints it to stderr and always exits with status 1.
        raise SystemExit(f"Command failed with status {ret}: {cmd}")