File size: 1,346 Bytes
c700ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import csv
import random
import json
import numpy as np

from sklearn.model_selection import ShuffleSplit

# Build a "short prefix -> full prompt" dataset from ./data/prompts.csv.
#
# Every CSV row carries a JSON document in its "raw_data" column. The
# "modifiers" list inside that document is comma-joined to form the target
# ``y``; a short prefix of the same list (1, 2 or 3 modifiers) is comma-joined
# to form the input ``x``. The result is written to
# ./data/dataset_openprompt.json as {"x": [...], "y": [...]}, where x[i] is
# paired with y[i].
samples = {
    "x": [],
    "y": [],
}
# Debug switch: when True, only the first 100 CSV rows are read.
little = False
# Upper bound on the number of CSV rows read. This counts rows, not kept
# samples: rows with fewer than 3 modifiers are read but skipped.
# NOTE(review): an earlier comment here said "200,000 entries", which does not
# match this value - confirm which one is intended.
all_loaded_sample = 500000
row_limit = min(100, all_loaded_sample) if little else all_loaded_sample

# newline="" is what the csv module asks for; the explicit encoding keeps the
# script independent of the platform's default locale.
with open("./data/prompts.csv", encoding="utf-8", newline="") as f:
    csv_reader = csv.DictReader(f)
    for row_number, row in enumerate(csv_reader):
        # row_number is 0-based, so ">=" reads exactly row_limit rows.
        if row_number >= row_limit:
            break

        modifiers = json.loads(row['raw_data'])['modifiers']
        # Need at least 3 modifiers so that every prefix length below is valid.
        if len(modifiers) < 3:
            continue
        label = ",".join(modifiers)

        # The task maps a short text to a longer one, so x is a prefix of y.
        # Prefix lengths 1 / 2 / 3 are drawn in a 6:3:1 ratio. randint is
        # inclusive on both ends, so the range must be 1..10 (not 1..11) for
        # the ten outcomes to split 6 + 3 + 1.
        n = random.randint(1, 10)
        if n <= 6:
            prefix_len = 1
        elif n <= 9:
            prefix_len = 2
        else:
            prefix_len = 3
        x = ",".join(modifiers[:prefix_len])

        samples["x"].append(x)
        samples["y"].append(label)


# ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
# opened with an encoding that can represent them.
with open("./data/dataset_openprompt.json", "w", encoding="utf-8") as f:
    json.dump(samples, f, indent=4, ensure_ascii=False)
    print("*"*40, "save train done.", "with little" if little else "", "*"*40)