Spaces:
Runtime error
Runtime error
File size: 4,931 Bytes
019ee78 66786b8 019ee78 15a824f 019ee78 15a824f 019ee78 15a824f 019ee78 e907f96 019ee78 15a824f 59d8204 15a824f 019ee78 59d8204 059bf6f 59d8204 019ee78 9e841e4 019ee78 64b18b7 59d8204 d50fd60 66786b8 d50fd60 28873f9 66786b8 1f5a1b1 d50fd60 59d8204 019ee78 59d8204 019ee78 15a824f 64b18b7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# -*-coding:utf-8 -*-
import re
import json
import random
import pandas as pd
class Instance(object):
    """Holds a list of (input, output) samples and splits them into
    few-shot training groups plus an evaluation set.

    By default use few-shot for generation and evaluation.
    """
    def __init__(self, loader=None):
        # loader: zero-argument callable returning a list of (input, output) tuples
        self.samples = loader()
        self.n_few_shot = 0      # samples per few-shot group
        self.n_train = 0         # number of few-shot groups
        self.n_eval = 0          # number of evaluation samples
        self.train_iter = None
        self.train_samples = []
        self.eval_samples = []

    @property
    def n_sample(self):
        """Total number of loaded samples."""
        return len(self.samples)

    def sample(self, n_train, n_few_shot, n_eval):
        """Randomly draw n_train * n_few_shot training samples and
        n_eval evaluation samples (without replacement).

        Raises:
            ValueError: if the requested split exceeds the available samples.
        """
        self.n_train = n_train
        self.n_few_shot = n_few_shot
        self.n_eval = n_eval
        n_train_total = n_train * n_few_shot
        if n_train_total + n_eval > len(self.samples):
            raise ValueError(f'Train + Eval > total samples {len(self.samples)}, decrease them')
        # random.sample accepts the range sequence directly; no list() copy needed
        index = random.sample(range(len(self.samples)), n_train_total + n_eval)
        train_index, eval_index = index[:n_train_total], index[n_train_total:]
        self.train_samples = [self.samples[i] for i in train_index]
        self.eval_samples = [self.samples[i] for i in eval_index]

    def get_train_iter(self):
        """Yield successive groups of n_few_shot training samples."""
        for g in range(self.n_train):
            start = g * self.n_few_shot
            yield self.train_samples[start:start + self.n_few_shot]

    @staticmethod
    def display(samples):
        """Format samples as one 'input >> output' line each."""
        # str.join avoids quadratic string concatenation in a loop
        return ''.join(f'{s[0]} >> {s[1]}\n' for s in samples)

    @classmethod
    def from_file(cls, loader):
        """Build an Instance from a loader function (e.g. load_paraphase)."""
        return cls(loader)

    @classmethod
    def from_list(cls, tuple_list):
        # Build an Instance directly from an in-memory (input, output) list
        def func():
            return tuple_list
        return cls(func)
def load_paraphase(file='./ape/data/paraphase_train.csv'):
    """Load paraphrase (input, output) pairs from a GBK-encoded CSV.

    The first two columns of each row become one (input, output) tuple.
    """
    df = pd.read_csv(file, encoding='GBK')
    # itertuples avoids the deprecated positional indexing of iterrows Series
    return [(row[0], row[1]) for row in df.itertuples(index=False)]
def load_intent(file='./ape/data/intent_train.csv'):
    """Load intent (input, output) pairs from a UTF-8, tab-separated CSV.

    The first two columns of each row become one (input, output) tuple.
    """
    df = pd.read_csv(file, encoding='UTF8', sep='\t')
    # itertuples avoids the deprecated positional indexing of iterrows Series
    return [(row[0], row[1]) for row in df.itertuples(index=False)]
def load_qa(file='./ape/data/qa_train.json'):
    """Load QA samples from a JSON file.

    Each item contributes one (text, qa_json) pair, where qa_json is the
    shortest annotation serialized as {'问题': Q, '回答': A}.

    Returns:
        list[tuple[str, str]]: (input text, shortest QA JSON string) pairs.
    """
    data = []
    # context manager fixes the unclosed file handle of json.load(open(...))
    with open(file, encoding='UTF8') as f:
        raw_data = json.load(f)
    for item in raw_data:
        text = item['text']
        # Keep only one QA pair, otherwise the model input length is easily exceeded
        candidates = [
            json.dumps({'问题': ann["Q"], '回答': ann["A"]}, ensure_ascii=False)
            for ann in item['annotations']
        ]
        if not candidates:
            # skip items with no annotations instead of crashing on empty list
            continue
        # min(key=len) replaces sorted(...)[0]; ties resolve to the first, as before
        data.append((text, min(candidates, key=len)))
    return data
def upload_file(file):
    """Read (input, output) pairs from a UTF-8 text file.

    Each line must contain exactly one space separating input from output;
    the trailing newline (if any) remains attached to the output field.

    Raises:
        ValueError: if a line does not contain exactly one space.
    """
    pairs = []
    with open(file, 'r', encoding='UTF-8') as handle:
        for line in handle:
            source, target = line.split(' ')
            pairs.append((source, target))
    return pairs
def upload_json(file):
    """Read (input, output) pairs from a JSON-lines file.

    Each line is a JSON object with 'input' and 'output' keys.
    """
    pairs = []
    with open(file, 'r', encoding='UTF-8') as handle:
        for line in handle:
            record = json.loads(line.strip())
            pairs.append((record['input'], record['output']))
    return pairs
def load_entity(file='./ape/data/entity_train.json'):
    """Load single-entity extraction samples from a JSON file.

    Keeps only DRUG_EFFICACY entity spans that still appear in the
    truncated, whitespace-stripped text.

    Returns:
        list[tuple[str, str]]: (text, JSON list of entity strings) pairs.
    """
    data = []
    # context manager fixes the unclosed file handle of json.load(open(...))
    with open(file, encoding='UTF8') as f:
        raw_data = json.load(f)
    for item in raw_data:
        # Truncate the text (otherwise too long) and strip all whitespace
        text = re.sub(r'\s+', '', item['text'][:200])
        # Split into a single-entity task type: keep only DRUG_EFFICACY
        # spans that survived the truncation above
        entities = [
            label[-1]
            for label in item['labels']
            if label[1] == 'DRUG_EFFICACY' and label[-1] in text
        ]
        data.append((text, json.dumps(entities, ensure_ascii=False)))
    return data
# Registry mapping a task name to its dataset-loader function; each loader
# returns a list of (input, output) tuples suitable for Instance.from_file.
LoadFactory = {
    'paraphase': load_paraphase,
    'search_intent': load_intent,
    'qa_generation': load_qa,
    'entity': load_entity
}
if __name__ == '__main__':
    # Demo: build instances from a file loader and from an in-memory list,
    # then print few-shot training groups.
    n_train = 2
    few_shot = 3
    n_eval = 2
    instance1 = Instance.from_file(load_paraphase)
    instance1.sample(n_train, few_shot, n_eval)
    print(instance1.display(instance1.train_samples))
    instance2 = Instance.from_list([('sane', 'insane'), ('direct', 'indirect'), ('informally', 'formally'),
                                    ('unpopular', 'popular'), ('subtractive', 'additive'),
                                    ('nonresidential', 'residential'), ('inexact', 'exact'),
                                    ('uptown', 'downtown'), ('incomparable', 'comparable'),
                                    ('powerful', 'powerless'), ('gaseous', 'solid'),
                                    ('evenly', 'unevenly'), ('formality', 'informality'),
                                    ('deliberately', 'accidentally'), ('off', 'on')])
    instance2.sample(n_train, few_shot, n_eval)
    print(instance2.display(instance2.train_samples))
    train_iter = instance2.get_train_iter()
    print(next(train_iter))
    # Stray trailing '|' removed from the original line — it was a syntax error
    data = upload_json('./ape/data/question_paraphrase_classification.json')