File size: 4,931 Bytes
019ee78
66786b8
019ee78
 
 
 
 
 
 
 
 
 
 
 
15a824f
 
 
 
019ee78
 
 
15a824f
 
 
 
019ee78
15a824f
 
 
019ee78
 
 
 
 
 
 
e907f96
019ee78
15a824f
 
59d8204
15a824f
019ee78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59d8204
 
 
 
 
059bf6f
 
 
 
 
 
59d8204
 
 
 
019ee78
 
9e841e4
019ee78
 
 
 
 
 
64b18b7
 
 
 
 
 
 
 
 
59d8204
 
d50fd60
 
66786b8
d50fd60
 
28873f9
66786b8
1f5a1b1
d50fd60
 
59d8204
 
 
019ee78
 
59d8204
 
 
019ee78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15a824f
 
64b18b7
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# -*-coding:utf-8 -*-
import re
import json
import random
import pandas as pd


class Instance(object):
    """
    Holds (input, output) sample pairs and splits them into few-shot
    training groups plus an evaluation set.

    By default use few-shot for generation and evaluation.
    """

    def __init__(self, loader=None):
        # loader: zero-argument callable returning a list of (input, output)
        # pairs (e.g. one of the load_* functions in this module).
        self.samples = loader()
        self.n_few_shot = 0
        self.n_train = 0
        self.n_eval = 0
        self.train_iter = None
        self.train_samples = []
        self.eval_samples = []

    @property
    def n_sample(self):
        """Total number of loaded samples."""
        return len(self.samples)

    def sample(self, n_train, n_few_shot, n_eval):
        """
        Randomly partition the samples into ``n_train`` groups of
        ``n_few_shot`` training samples plus ``n_eval`` evaluation samples.

        Raises:
            ValueError: if the requested split exceeds the available samples.
        """
        self.n_train = n_train
        self.n_few_shot = n_few_shot
        self.n_eval = n_eval
        # Total training samples needed across all few-shot groups.
        n_train_total = n_train * n_few_shot
        if n_train_total + n_eval > len(self.samples):
            raise ValueError(f'Train + Eval > total samples {len(self.samples)}, decrease them')

        # random.sample accepts a range directly; no need to materialize a list.
        index = random.sample(range(len(self.samples)), n_train_total + n_eval)
        train_index, eval_index = index[:n_train_total], index[n_train_total:]
        self.train_samples = [self.samples[i] for i in train_index]
        self.eval_samples = [self.samples[i] for i in eval_index]

    def get_train_iter(self):
        """Yield successive few-shot groups (lists) of training samples."""
        for i in range(self.n_train):
            yield self.train_samples[(i * self.n_few_shot):(i + 1) * self.n_few_shot]

    @staticmethod
    def display(samples):
        """Format samples as one 'input >> output' line per pair."""
        # join is linear; the original += loop was quadratic in total length.
        return ''.join(f'{pair[0]} >> {pair[1]}\n' for pair in samples)

    @classmethod
    def from_file(cls, loader):
        """Build an Instance from a loader callable."""
        return cls(loader)

    @classmethod
    def from_list(cls, tuple_list):
        """Build an Instance directly from an in-memory (input, output) list."""
        return cls(lambda: tuple_list)


def load_paraphase(file='./ape/data/paraphase_train.csv'):
    """Load paraphrase pairs from a GBK-encoded CSV.

    Returns a list of (input, output) tuples taken from the first two columns.
    """
    df = pd.read_csv(file, encoding='GBK')
    # Positional access via iloc: the original `row[0]` relied on the
    # deprecated integer-key fallback on a label-indexed Series, which is
    # removed in newer pandas when columns have string names.
    return list(zip(df.iloc[:, 0], df.iloc[:, 1]))


def load_intent(file='./ape/data/intent_train.csv'):
    """Load search-intent pairs from a UTF-8, tab-separated CSV.

    Returns a list of (input, output) tuples taken from the first two columns.
    """
    df = pd.read_csv(file, encoding='UTF8', sep='\t')
    # iloc keeps the access positional regardless of column names; the
    # original `row[0]` used the deprecated integer-key Series fallback.
    return list(zip(df.iloc[:, 0], df.iloc[:, 1]))


def load_qa(file='./ape/data/qa_train.json'):
    """Load QA samples from a JSON file.

    Each record contributes one (text, serialized-QA) pair; only the shortest
    QA annotation is kept so the model input length is not exceeded.
    """
    data = []
    # with-block closes the handle; the original json.load(open(...)) leaked it.
    with open(file, encoding='UTF8') as f:
        raw_data = json.load(f)
    for item in raw_data:
        candidates = [
            json.dumps({'问题': ann['Q'], '回答': ann['A']}, ensure_ascii=False)
            for ann in item['annotations']
        ]
        # Keep only one (the shortest) QA per text to stay within input limits.
        data.append((item['text'], min(candidates, key=len)))
    return data


def upload_file(file):
    """Load (input, output) pairs from a space-separated text file.

    Each non-empty line is ``input output``. Only the first space splits, so
    the output part may itself contain spaces. Lines are stripped first: the
    original kept the trailing '\\n' inside the output (inconsistent with
    upload_json) and crashed on blank lines or lines with extra spaces.
    """
    tuple_list = []
    with open(file, 'r', encoding='UTF-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank/trailing lines
            src, tgt = line.split(' ', 1)
            tuple_list.append((src, tgt))
    return tuple_list


def upload_json(file):
    """Read a JSON-lines file and collect (input, output) pairs.

    Each line must be a JSON object with 'input' and 'output' keys.
    """
    pairs = []
    with open(file, 'r', encoding='UTF-8') as handle:
        for raw_line in handle:
            record = json.loads(raw_line.strip())
            pairs.append((record['input'], record['output']))
    return pairs


def load_entity(file='./ape/data/entity_train.json'):
    """Load single-entity extraction samples (DRUG_EFFICACY labels only).

    Each record becomes (truncated whitespace-free text, JSON-encoded list of
    entity mentions still present in the truncated text).
    """
    data = []
    # with-block closes the handle; the original json.load(open(...)) leaked it.
    with open(file, encoding='UTF8') as f:
        raw_data = json.load(f)
    for item in raw_data:
        # Truncate to 200 chars (texts are otherwise too long) and strip all
        # whitespace runs.
        text = re.sub(r'\s{1,}', '', item['text'][:200])
        # Split into a single-entity task: keep only DRUG_EFFICACY mentions
        # that survived the truncation.
        entities = [label[-1] for label in item['labels']
                    if label[1] == 'DRUG_EFFICACY' and label[-1] in text]
        data.append((text, json.dumps(entities, ensure_ascii=False)))
    return data


# Registry mapping task name -> loader function. Each loader returns a list
# of (input, output) tuples suitable for Instance.from_file.
LoadFactory = {
    'paraphase': load_paraphase,
    'search_intent': load_intent,
    'qa_generation': load_qa,
    'entity': load_entity
}

if __name__ == '__main__':
    # Smoke test: build instances both from an on-disk loader and from an
    # in-memory list, sample few-shot train/eval splits, and print them.
    # NOTE(review): requires ./ape/data/*.csv|*.json to exist — paths are
    # relative to the working directory.
    n_train = 2
    few_shot = 3
    n_eval = 2
    instance1 = Instance.from_file(load_paraphase)
    instance1.sample(n_train, few_shot, n_eval)
    print(instance1.display(instance1.train_samples))
    # Antonym pairs used as a self-contained (input, output) demo dataset.
    instance2 = Instance.from_list([('sane', 'insane'), ('direct', 'indirect'), ('informally', 'formally'),
                                    ('unpopular', 'popular'), ('subtractive', 'additive'),
                                    ('nonresidential', 'residential'), ('inexact', 'exact'),
                                    ('uptown', 'downtown'), ('incomparable', 'comparable'),
                                    ('powerful', 'powerless'), ('gaseous', 'solid'),
                                    ('evenly', 'unevenly'), ('formality', 'informality'),
                                    ('deliberately', 'accidentally'), ('off', 'on')])
    instance2.sample(n_train, few_shot, n_eval)
    print(instance2.display(instance2.train_samples))
    # Pull the first few-shot group from the training iterator.
    train_iter = instance2.get_train_iter()
    print(next(train_iter))


    # Result is unused here; this only exercises the JSON-lines loader.
    data = upload_json('./ape/data/question_paraphrase_classification.json')