"""Split a labelled JSONL dataset into stratified train/valid/test files.

Each input line is a JSON object with a ``target`` field. Entries whose
target is 0 or 1 are split per class with the same ratios, so every output
file keeps roughly the class balance of the input. Entries with any other
target value are dropped.
"""
import json
import random

# Input dataset and the three output files (all JSON Lines).
input_filepath = 'commit_data_hpc_modified.jsonl'
train_filepath = 'train.jsonl'
valid_filepath = 'valid.jsonl'
test_filepath = 'test.jsonl'

# Fraction of each class that goes to train and to valid; the rest is test.
train_ratio, valid_ratio = 0.8, 0.1


def partition_by_target(lines):
    """Parse JSONL ``lines`` and group the entries by their ``target`` value.

    Args:
        lines: Iterable of strings, one JSON object per line (an open text
            file works).

    Returns:
        A ``(data_0, data_1)`` tuple of lists holding the entries whose
        ``target`` equals 0 and 1 respectively, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry has no ``target`` key.
    """
    data_0 = []
    data_1 = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not records;
            # json.loads('') would raise on them.
            continue
        entry = json.loads(line)
        if entry['target'] == 0:
            data_0.append(entry)
        elif entry['target'] == 1:
            data_1.append(entry)
    return data_0, data_1


def stratified_split(data_0, data_1, train_ratio, valid_ratio, seed=None):
    """Split two class lists into train/valid/test with the same ratios.

    Each class is shuffled and cut at ``int(n * train_ratio)`` and
    ``int(n * (train_ratio + valid_ratio))``; the matching slices of both
    classes are merged and each merged split is shuffled again.

    Args:
        data_0: Entries of the first class (not modified).
        data_1: Entries of the second class (not modified).
        train_ratio: Fraction of each class assigned to train.
        valid_ratio: Fraction of each class assigned to valid.
        seed: Optional seed for a reproducible split; ``None`` gives a
            different split on every run.

    Returns:
        A ``(train, valid, test)`` tuple of lists.
    """
    rng = random.Random(seed)
    train, valid, test = [], [], []
    for items in (data_0, data_1):
        pool = list(items)  # shuffle a copy so the caller's list is untouched
        rng.shuffle(pool)
        train_end = int(len(pool) * train_ratio)
        valid_end = int(len(pool) * (train_ratio + valid_ratio))
        train.extend(pool[:train_end])
        valid.extend(pool[train_end:valid_end])
        test.extend(pool[valid_end:])
    # Without this, every split would list all class-0 rows before any
    # class-1 row, which hurts consumers that read the file sequentially.
    for part in (train, valid, test):
        rng.shuffle(part)
    return train, valid, test


def write_jsonl(path, entries):
    """Write ``entries`` to ``path`` as JSON Lines, one object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)


def main(seed=None):
    """Read the input file, split it, and write the three output files.

    Args:
        seed: Optional seed forwarded to :func:`stratified_split`.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(input_filepath, 'r', encoding='utf-8') as infile:
        data_0, data_1 = partition_by_target(infile)

    train_data, valid_data, test_data = stratified_split(
        data_0, data_1, train_ratio, valid_ratio, seed=seed
    )

    write_jsonl(train_filepath, train_data)
    write_jsonl(valid_filepath, valid_data)
    write_jsonl(test_filepath, test_data)

    print("File splitting complete!")


if __name__ == "__main__":
    main()