|
import json |
|
import random |
|
|
|
def split_jsonl(input_file, output_file1, output_file2, split_ratio=0.8,
                seed=None):
    """Randomly split a JSONL file into two JSONL files by ratio.

    Records are read from ``input_file`` and shuffled; the first
    ``int(record_count * split_ratio)`` records are written to
    ``output_file1`` and the remaining ones to ``output_file2``.

    Every record written is terminated by exactly one newline, so a final
    input line that lacks a trailing newline cannot be fused with the record
    that follows it after shuffling. Blank (whitespace-only) lines are not
    treated as records and are dropped.

    :param input_file: Path of the JSONL file to read.
    :param output_file1: Path of the first output JSONL file.
    :param output_file2: Path of the second output JSONL file.
    :param split_ratio: Fraction of the records written to the first output
        file (default 0.8).
    :param seed: Optional seed for a reproducible shuffle. When ``None``
        (the default) the module-level ``random`` generator is used.
    """
    with open(input_file, 'r', encoding='utf-8') as infile:
        # Strip only the line terminator (not other whitespace) so that each
        # record can be re-terminated uniformly when it is written out.
        records = [line.rstrip('\r\n') for line in infile if line.strip()]

    if seed is None:
        random.shuffle(records)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(records)

    split_point = int(len(records) * split_ratio)

    partitions = (
        (output_file1, records[:split_point]),
        (output_file2, records[split_point:]),
    )
    for path, chunk in partitions:
        with open(path, 'w', encoding='utf-8') as outfile:
            outfile.writelines(record + '\n' for record in chunk)
|
|
|
|
|
# Source dataset and the two destination files for the train/validation split.
input_file = '/home/ubuntu/model_sft/ch/pubmedqa/data/ori_pqal_dealed.jsonl'
output_file1 = '/home/ubuntu/model_sft/ch/pubmedqa/data/train.jsonl'
output_file2 = '/home/ubuntu/model_sft/ch/pubmedqa/data/val.jsonl'

# Fraction of the records sent to output_file1; the rest go to output_file2.
split_ratio = 0.8

split_jsonl(
    input_file=input_file,
    output_file1=output_file1,
    output_file2=output_file2,
    split_ratio=split_ratio,
)
|
|
|
|