Spaces:
Sleeping
Sleeping
File size: 2,478 Bytes
ecca75f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# coding=utf-8
import sys
# Append "../" to the module search path; a relative entry like this is
# resolved against the current working directory of the process.
sys.path.append("../")
from collections import defaultdict
from .utils import is_float, load_txt
import random
# Seed the global `random` generator with a fixed value so that any later
# use of the `random` module is reproducible across runs.
random.seed(1234)
class CreateDataset:
    """Holds the character-dataset prompt and the containers filled from it.

    ``choose_examples`` is a stateless helper that packs a collection of
    examples into one separator-joined string that fits a length budget.
    """

    def __init__(self, max_input_len=1500):
        """Load the prompt text and create empty dataset containers.

        Args:
            max_input_len: Maximum input length. Per the original note it
                must be smaller than (seq-length) - (max-gen-length).
        """
        self.prompt = load_txt("../prompt/dataset_character.txt")
        self.max_input_len = max_input_len
        # Separator placed between examples: 20 dashes on their own line.
        self.example_split_flag = f"\n{'-' * 20}\n"
        self.dataset = defaultdict(list)
        self.manual_dataset = []

    @staticmethod
    def choose_examples(similar_examples,
                        max_length,
                        train_flag=False,
                        dialog=None,
                        example_split_flag=f"\n{'-' * 20}\n"):
        """Pick leading examples and join them without exceeding ``max_length``.

        Args:
            similar_examples: Either a single string of examples already
                joined with ``example_split_flag`` (it is split and each part
                stripped; no de-duplication is done in this case), or an
                iterable of examples. Each example may be a sequence of text
                lines (joined with newlines), a plain string (used as-is), or
                a 2-item list/tuple ``(score, example)`` whose first item
                satisfies ``is_float``; then only the example part is used.
            max_length: Maximum ``len()`` of the returned string, separators
                included.
            train_flag: When true, examples overlapping ``dialog`` are
                skipped, and an example is also dropped if it contains, or is
                contained in, an already kept example. When false, only exact
                duplicates are dropped.
            dialog: Text of the current dialogue, used only when
                ``train_flag`` is true. Presumably a string; confirm against
                callers.
            example_split_flag: Separator placed between examples.

        Returns:
            The selected examples joined by ``example_split_flag``. Examples
            are taken in order and selection stops at the first example that
            would push the total over ``max_length``. Blank examples are
            ignored.

        Raises:
            TypeError: If an example cannot be joined as lines of text.
        """
        if isinstance(similar_examples, str):
            pieces = (part.strip() for part in similar_examples.split(example_split_flag))
            # Blank pieces would only contribute stray separators to the output.
            candidates = [piece for piece in pieces if piece]
        else:
            candidates = []
            for example in similar_examples:
                if isinstance(example, (list, tuple)) and len(example) == 2 and is_float(example[0]):
                    # (score, example) pair: keep only the example part.
                    example = example[1]
                if isinstance(example, str):
                    # Joining a str would put every character on its own line.
                    text = example.strip()
                else:
                    try:
                        text = "\n".join(example).strip()
                    except TypeError as err:
                        raise TypeError(f"example: {example}") from err
                if not text:
                    # "" is a substring of every string: keeping it would make
                    # the train-mode overlap check discard all later examples
                    # and would break the length accounting below.
                    continue
                if train_flag and dialog and (text in dialog or dialog in text):
                    continue
                if train_flag:
                    # Partial overlap with a kept example also counts as a duplicate.
                    duplicate = any(text in kept or kept in text for kept in candidates)
                else:
                    duplicate = text in candidates
                if not duplicate:
                    candidates.append(text)

        selected = []
        total_length = 0
        for example in candidates:
            cost = len(example)
            if selected:
                # Every example after the first is preceded by a separator.
                cost += len(example_split_flag)
            if total_length + cost > max_length:
                break
            total_length += cost
            selected.append(example)
        return example_split_flag.join(selected).strip()
|