DSXiangLi commited on
Commit
019ee78
1 Parent(s): f4af8ee
ape/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # -*-coding:utf-8 -*-
ape/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (125 Bytes). View file
 
ape/__pycache__/prompt.cpython-38.pyc ADDED
Binary file (931 Bytes). View file
 
ape/ape.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*-coding:utf-8 -*-
2
+
3
+ from ape.instance import Instance, LoadFactory
4
+ from functools import partial
5
+
6
+
7
def load_task(task, file):
    """Build an ``Instance`` from a bundled task or from an uploaded file.

    Args:
        task: Key into ``LoadFactory`` naming a bundled dataset. When falsy,
            ``file`` is used instead.
        file: Uploaded file, either a single entry or a list/tuple of entries.
            Each entry is a path string or an object exposing ``.name``.
            Only the first entry is loaded.

    Returns:
        The newly created ``Instance``.

    Raises:
        ValueError: If neither a task nor a file was supplied.
    """
    if task:
        loader = LoadFactory[task]
        print(loader)
    else:
        print(file)
        if file is None or (isinstance(file, (list, tuple)) and not file):
            raise ValueError('Either choose an existing task or upload a file')
        # The upload widget may hand over one file object or a list of them.
        first = file[0] if isinstance(file, (list, tuple)) else file
        path = first if isinstance(first, str) else first.name
        loader = partial(LoadFactory['upload'], file=path)
    # Kept local: a module-level global would be shared by every caller.
    instance = Instance.from_file(loader)
    if instance.samples:
        print(instance.samples[0])
    return instance
18
+
19
+
20
def sample_data(instance, n_train, n_few_shot, n_eval):
    """Re-sample ``instance`` and render both splits as text.

    Returns:
        A ``(train_text, eval_text, instance)`` tuple, where the two texts come
        from ``instance.display`` applied to the train and eval samples.
    """
    instance.sample(n_train, n_few_shot, n_eval)
    rendered = [
        instance.display(getattr(instance, split))
        for split in ('train_samples', 'eval_samples')
    ]
    return rendered[0], rendered[1], instance
25
+
ape/data/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 医疗指令集数据来源
2
+
3
+ 1. 文本分类:非标准化疾病诉求数据集 https://aistudio.baidu.com/aistudio/datasetdetail/104082
4
+ 2. 意图理解:医疗搜索意图识别挑战赛 https://aistudio.baidu.com/aistudio/datasetdetail/166530
5
+ 3. 问题生成:中医文献问题生成数据集 https://tianchi.aliyun.com/dataset/86895
6
+ 4. 实体抽取:中文医学文本实体关系抽取 https://www.biendata.xyz/competition/chip_2020_2/data/
7
+ 5. 信息抽取:面向中文电子病历的医疗实体及事件抽取 https://www.biendata.xyz/competition/ccks_2021_clinic/data/
8
+ 6. Paraphrase:临床术语标准化任务 http://openkg.cn/dataset/yidu-n7k
9
+ 7. QA:新冠知识图谱构建与问答评测 https://www.biendata.xyz/competition/ccks_2020_7_4/data/
10
+
ape/data/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # -*-coding:utf-8 -*-
ape/data/event_ie_train.json ADDED
The diff for this file is too large to render. See raw diff
 
ape/data/intent_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
ape/data/paraphase_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
ape/instance.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*-coding:utf-8 -*-
2
+ import json
3
+ import random
4
+ import pandas as pd
5
+
6
+
7
class Instance(object):
    """Dataset of (input, output) samples with a random train/eval split.

    By default few-shot prompting is used for generation and evaluation, so
    the number of training samples drawn is ``n_train * n_few_shot``.
    """

    def __init__(self, loader=None):
        """Load all samples by calling ``loader``; no loader means no samples."""
        self.samples = loader() if loader is not None else []
        self.train_samples = []
        self.eval_samples = []

    def sample(self, n_train, n_few_shot, n_eval):
        """Draw disjoint train and eval subsets without replacement.

        Args:
            n_train: Number of few-shot groups used for prompt generation.
            n_few_shot: Number of samples per group.
            n_eval: Number of samples reserved for evaluation.

        Raises:
            ValueError: If more samples are requested than are available.
        """
        n_train = n_train * n_few_shot
        total = len(self.samples)
        if n_train + n_eval > total:
            raise ValueError(f'Train + Eval > total samples {total}, decrease them')

        index = random.sample(range(total), n_train + n_eval)
        train_index, eval_index = index[:n_train], index[n_train:]
        self.train_samples = [self.samples[i] for i in train_index]
        # Must use eval_index: reusing train_index would score on training data.
        self.eval_samples = [self.samples[i] for i in eval_index]

    @staticmethod
    def display(samples):
        """Render samples as one ``input >> output`` line each."""
        return ''.join(f'{pair[0]} >> {pair[1]}\n' for pair in samples)

    @classmethod
    def from_file(cls, loader):
        """Create an instance from a zero-argument loader callable."""
        return cls(loader)

    @classmethod
    def from_list(cls, tuple_list):
        """Create an instance directly from a list of (input, output) tuples."""
        def func():
            return tuple_list

        return cls(func)
45
+
46
+
47
def load_event_extraction(file='./ape/data/event_ie_train.json'):
    """Load a JSON-lines file into a list of parsed records.

    Each non-blank line is parsed with ``json.loads``. Blank lines, such as a
    trailing newline at the end of the file, are skipped.

    Args:
        file: Path to the JSON-lines file.

    Returns:
        A list with one parsed object per non-blank line, in file order.
    """
    with open(file, 'rb') as f:
        return [json.loads(line) for line in f if line.strip()]
53
+
54
+
55
def load_paraphase(file='./ape/data/paraphase_train.csv'):
    """Load the paraphrase dataset as a list of (input, output) tuples.

    The file is read as a GBK-encoded CSV. The first column is the input and
    the second column is the output; any further columns are ignored.

    Args:
        file: Path to the CSV file.

    Returns:
        A list of ``(first_column, second_column)`` tuples, one per row.
    """
    df = pd.read_csv(file, encoding='GBK')
    # Select columns by position with .iloc; integer keys on a label-indexed
    # row Series rely on a deprecated positional fallback.
    return list(zip(df.iloc[:, 0], df.iloc[:, 1]))
61
+
62
+
63
def load_intent(file='./ape/data/intent_train.csv'):
    """Load the search-intent dataset as a list of (input, output) tuples.

    The file is read as a UTF-8, tab-separated table. The first column is the
    input and the second column is the output; further columns are ignored.

    Args:
        file: Path to the tab-separated file.

    Returns:
        A list of ``(first_column, second_column)`` tuples, one per row.
    """
    df = pd.read_csv(file, encoding='UTF8', sep='\t')
    # Select columns by position with .iloc; integer keys on a label-indexed
    # row Series rely on a deprecated positional fallback.
    return list(zip(df.iloc[:, 0], df.iloc[:, 1]))
69
+
70
+
71
def upload_file(file):
    """Parse a user-uploaded text file into (input, output) tuples.

    Each non-blank line holds one pair. The line is split at its first tab
    when it contains one, otherwise at its first space. The line terminator is
    stripped so it does not end up in the output value.

    Args:
        file: Path to a UTF-8 text file.

    Returns:
        A list of ``(input, output)`` string tuples in file order.

    Raises:
        ValueError: If a non-blank line contains neither a tab nor a space.
    """
    tuple_list = []
    with open(file, 'r', encoding='utf-8') as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.rstrip('\r\n')
            if not line.strip():
                continue
            sep = '\t' if '\t' in line else ' '
            source, found, target = line.partition(sep)
            if not found:
                raise ValueError(
                    f'Line {line_no} has no tab or space separating input and output')
            tuple_list.append((source, target))
    return tuple_list
78
+
79
+
80
# Registry mapping a task name to the loader that produces its samples.
LoadFactory = dict(
    paraphase=load_paraphase,
    event_extract=load_event_extraction,
    search_intent=load_intent,
    upload=upload_file,
)
86
+
87
if __name__ == '__main__':
    # Smoke test: sample from the bundled paraphrase data and from an inline list.
    n_train, few_shot, n_eval = 2, 3, 2

    instance1 = Instance.from_file(load_paraphase)
    instance1.sample(n_train, few_shot, n_eval)
    print(instance1.display(instance1.train_samples))

    antonym_pairs = [
        ('sane', 'insane'), ('direct', 'indirect'), ('informally', 'formally'),
        ('unpopular', 'popular'), ('subtractive', 'additive'),
        ('nonresidential', 'residential'), ('inexact', 'exact'),
        ('uptown', 'downtown'), ('incomparable', 'comparable'),
        ('powerful', 'powerless'), ('gaseous', 'solid'),
        ('evenly', 'unevenly'), ('formality', 'informality'),
        ('deliberately', 'accidentally'), ('off', 'on'),
    ]
    instance2 = Instance.from_list(antonym_pairs)
    instance2.sample(n_train, few_shot, n_eval)
    print(instance2.display(instance2.train_samples))
ape/llm.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*-coding:utf-8 -*-
2
+ from tqdm import tqdm
3
+ import tiktoken
4
+ from ape.prompt import MyTemplate
5
+ from langchain.chat_models import ChatOpenAI
6
+ from langchain.llms import OpenAI
7
+ from langchain.prompts.chat import (
8
+ ChatPromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ HumanMessagePromptTemplate,
11
+ )
12
+ from langchain.prompts import PromptTemplate
13
+ from langchain.chains.llm import LLMChain
14
+
15
+ Cost = {
16
+ 'davinci': 0.02,
17
+ 'chatgpt': 0.004
18
+ }
19
+
20
+
21
class LLMGPT(object):
    """Wrap the two LLM chains used by APE: instruction generation and scoring."""

    # Used only when MyTemplate has no 'few_shot' entry.
    _FEW_SHOT_FALLBACK = "Input: {input}\nOutput: {output}"

    def __init__(self, openai_key, max_tokens, n_instruct, loader=None):
        """Create both LLM clients, load samples and build the chains.

        Args:
            openai_key: API key passed to both clients.
            max_tokens: Completion length limit passed to both clients.
            n_instruct: Number of candidates requested per generation call.
            loader: Optional zero-argument callable returning the samples.
        """
        self.gen_llm = ChatOpenAI(openai_api_key=openai_key, max_tokens=max_tokens, temperature=0.7, n=n_instruct)
        self.eval_llm = OpenAI(openai_api_key=openai_key, max_tokens=max_tokens, temperature=0.7, echo=True)
        self.gen_instruct_chain = None
        self.gen_chain = None
        self.eval_chain = None
        # Calling a None loader would raise TypeError; treat it as "no samples".
        self.samples = loader() if loader is not None else []
        self.init()

    @staticmethod
    def confirm_cost(prompt, mode):
        """Estimate the price of sending ``prompt`` once.

        Args:
            prompt: Text whose tokens are counted with the cl100k_base encoding.
            mode: ``'train'`` uses 0.02 per 1000 tokens; anything else uses 0.0004.

        Returns:
            The estimated price as a float.
        """
        # NOTE(review): these rates differ from the module-level Cost table
        # (0.004 for 'chatgpt'); confirm which values are intended.
        cost = 0.02 if mode == 'train' else 0.0004
        encoding = tiktoken.get_encoding("cl100k_base")
        num_tokens = len(encoding.encode(prompt))
        return (num_tokens / 1000) * cost

    def init(self):
        """Build the generation and evaluation chains from the prompt templates."""
        # The template table registers the system prompt as 'gen_system_prompt';
        # accept either spelling so this class works with both.
        sys_key = 'gen_sys_prompt' if 'gen_sys_prompt' in MyTemplate else 'gen_system_prompt'
        prompt = ChatPromptTemplate.from_messages(
            [
                SystemMessagePromptTemplate.from_template(MyTemplate[sys_key]),
                HumanMessagePromptTemplate.from_template(MyTemplate['gen_user_prompt']),
            ]
        )
        self.gen_instruct_chain = LLMChain(llm=self.gen_llm, prompt=prompt)
        # Alias kept so the attribute initialised in __init__ is usable too.
        self.gen_chain = self.gen_instruct_chain

        prompt = PromptTemplate.from_template(MyTemplate['eval_prompt'])
        self.eval_chain = LLMChain(llm=self.eval_llm, prompt=prompt)

    def generate_instruction(self, few_shot):
        """Generate candidate instructions from few-shot examples.

        Args:
            few_shot: Iterable of (input, output) pairs.

        Returns:
            Whatever the generation chain's ``generate`` call returns.
        """
        template = MyTemplate.get('few_shot', self._FEW_SHOT_FALLBACK)
        # The template uses named placeholders, so format by keyword; one
        # example per line keeps neighbouring shots from running together.
        prompt = '\n'.join(template.format(input=shot[0], output=shot[1]) for shot in few_shot)
        print(prompt)
        # LLMChain.generate expects a list of input dicts keyed by prompt variable.
        result = self.gen_instruct_chain.generate([{'few_shot': prompt}])
        return result

    def generate_logprobs(self, ):
        """
        Eval instruction (not implemented yet; currently returns None).
        """
ape/prompt.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*-coding:utf-8 -*-
2
+ from langchain import PromptTemplate
3
+
4
+ few_shot_prompt = "Input: {input}\nOutput: {output}"
5
+
6
+ gen_user_prompt = '{few_shot}'
7
+
8
+ gen_sys_prompt = """
9
+ I want you to act as an AI assisted doctor. You are capable of answering anything related to medical. Given
10
+ a list of input-output pairs, you must come up with the correct instruction in medical-related area.
11
+ You must respond in the following format, and always respond in chinese.
12
+ ```
13
+ {{'instruction':"$YOUR_INSTRUCTION"}}
14
+ ```
15
+ Everything between the ``` must be valid json.
16
+ """
17
+
18
+ eval_prompt = "Instruction: {prompt}\nInput: {input}\nOutput: {output}"
19
+
20
+
21
# Lookup table of prompt templates used by the LLM wrapper and the UI.
MyTemplate = {
    'few_shot': few_shot_prompt,
    'gen_user_prompt': gen_user_prompt,
    # Consumers look up 'gen_sys_prompt'; the longer key is kept as an alias.
    'gen_sys_prompt': gen_sys_prompt,
    'gen_system_prompt': gen_sys_prompt,
    'eval_prompt': eval_prompt,
}
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*-coding:utf-8 -*-
2
+ import os
3
+ import gradio as gr
4
+ from ape.instance import LoadFactory
5
+ from ape.prompt import MyTemplate
6
+ from ape.ape import *
7
+
8
+
9
# Gradio UI for the Automatic Prompt Engineer demo.
# Read-only fields use interactive=False, the component argument Gradio
# provides for that; `disabled` is not a component argument.
with gr.Blocks(title="Automatic Prompt Engineer", css=None) as demo:
    gr.Markdown("# Automatic Prompt Engineer")
    openai_key = gr.Textbox(type='password', label='输入 API key')

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("## Configuration")
            with gr.Row():
                n_train = gr.Slider(label="Number of Train", minimum=1, maximum=20, step=1, value=5)
                n_few_shot = gr.Slider(label="Number of FewShot", minimum=1, maximum=20, step=1, value=5)

            with gr.Row():
                n_eval = gr.Slider(label="Number of Eval", minimum=5, maximum=30, step=5, value=20)
                n_instruct = gr.Slider(label="Number of Prompt", minimum=1, maximum=5, step=1, value=2)

        with gr.Column(scale=3):
            gr.Markdown("## 加载数据集")
            with gr.Tab("Choose Dataset"):
                with gr.Row():
                    # File extensions need a leading dot.
                    file = gr.File(label='上传txt文件,input\toutput\n', file_types=['.txt'])
                with gr.Row():
                    # Pass a real list rather than a dict_keys view.
                    task = gr.Dropdown(label="Chosse Existing Task", choices=list(LoadFactory.keys()), value=None)
                with gr.Row():
                    instance = gr.State()
                    load_button = gr.Button("Load Task")
                    sample_button = gr.Button('sample Data')

            with gr.Tab("Display Sampled Dataset"):
                with gr.Row():
                    train_str = gr.Textbox(max_lines=100, lines=10, label="Data for prompt generation")
                    eval_str = gr.Textbox(max_lines=100, lines=10, label="Data for scoring")

    with gr.Row():
        gr.Markdown("## Run APE")
        with gr.Column(scale=2):
            with gr.Row():
                gr.Markdown('1. Generate Prompt')
                # The template table may register this prompt under either key.
                gr.Markdown(MyTemplate.get('gen_sys_prompt', MyTemplate.get('gen_system_prompt', '')))
                gr.Markdown('2. Evaluate Prompt')
                gr.Markdown(MyTemplate['eval_prompt'])

            with gr.Row():
                basic_cost = gr.Textbox(lines=1, value="", label="Estimated Cost ($)", interactive=False)
                basic_cost_button = gr.Button("Estimate Cost")
                basic_ape_button = gr.Button("Run APE")

        with gr.Column(scale=3):
            with gr.Tab("APE Results"):
                # Display all generated prompts with their log probs
                output_df = gr.DataFrame(type='pandas', headers=['Prompt', 'Likelihood'], wrap=True, interactive=False)

            with gr.Tab("Prompt Overview"):
                with gr.Row():
                    generation_prompt_sample = gr.Textbox(lines=8, value="",
                                                          label="Instruction Generation Prompts",
                                                          interactive=False)
                    evaluation_prompt_sample = gr.Textbox(lines=8, value="",
                                                          label="Evaluation Prompts",
                                                          interactive=False)

            with gr.Tab("Test Prompt"):
                # Test the output of the LLM using a prompt
                with gr.Row():
                    with gr.Column(scale=1):
                        test_prompt = gr.Textbox(lines=4, value="",
                                                 label="Prompt to test")
                        test_inputs = gr.Textbox(lines=1, value="",
                                                 label="Input used to test prompt")
                        answer_button = gr.Button("Test")
                    with gr.Column(scale=1):
                        test_output = gr.Textbox(lines=9, value="", label="Model Output")

            with gr.Tab("Eval Prompt"):
                # By default use the evaluation set in APE
                with gr.Row():
                    with gr.Column(scale=1):
                        score_prompt = gr.Textbox(lines=3, value="",
                                                  label="Prompt to Evaluate")
                        compute_score_button = gr.Button("Evaluate")
                    with gr.Column(scale=1):
                        test_score = gr.Textbox(lines=1, value="", label="Log(p)", interactive=False)

    # ---- Callbacks ----
    # 1. Pick an existing task or upload a file, then instantiate Instance
    load_button.click(load_task, [task, file], [instance])

    # 2. Sample data according to the Configuration to get the training and
    #    evaluation samples and show them in the UI; re-sampling is supported
    sample_button.click(sample_data, [instance, n_train, n_few_shot, n_eval], [train_str, eval_str, instance])

    # 3. Estimate Cost for train + Eval

    # 4. Run APE -> all instructions and the log prob of each instruction

    # 5. Test a single instruction

    # 6. Score a hand-written instruction
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ tiktoken
3
+ openai
4
+ langchain
5
+ pandas