Spaces:

xl2533
/

MakeInstruction

Runtime error

App Files Files Community

DSXiangLi committed on Apr 5, 2023

Commit

019ee78

•

1 Parent(s): f4af8ee

init

Browse files

Files changed (14) hide show

ape/__init__.py +1 -0
ape/__pycache__/__init__.cpython-38.pyc +0 -0
ape/__pycache__/prompt.cpython-38.pyc +0 -0
ape/ape.py +25 -0
ape/data/README.md +10 -0
ape/data/__init__.py +1 -0
ape/data/event_ie_train.json +0 -0
ape/data/intent_train.csv +0 -0
ape/data/paraphase_train.csv +0 -0
ape/instance.py +102 -0
ape/llm.py +67 -0
ape/prompt.py +25 -0
app.py +108 -0
requirements.txt +5 -0

ape/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # --coding:utf-8 --

ape/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (125 Bytes). View file

ape/__pycache__/prompt.cpython-38.pyc ADDED Viewed

Binary file (931 Bytes). View file

ape/ape.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# -*-coding:utf-8 -*-
+from ape.instance import Instance, LoadFactory
+from functools import partial
+def load_task(task, file):
+    """
+    Build an Instance from a built-in task or from an uploaded file.
+
+    :param task: key of LoadFactory chosen in the UI; a falsy value or 'upload' means
+        "use the uploaded file"
+    :param file: uploaded file object exposing its local path as ``.name``, a list of such
+        objects (only the first one is used), or a plain path string
+    :return: Instance holding the loaded samples
+    :raises ValueError: if no built-in task is chosen and no file was uploaded
+    """
+    if task and task != 'upload':
+        loader = LoadFactory[task]
+    else:
+        # A single-file upload arrives as one object, a multi-file upload as a list
+        if isinstance(file, (list, tuple)):
+            file = file[0] if file else None
+        if file is None:
+            raise ValueError('Choose an existing task or upload a file first')
+        path = getattr(file, 'name', file)
+        loader = partial(LoadFactory['upload'], file=path)
+    # The instance is only returned (the caller keeps it in per-session state);
+    # a module-level global would be shared by every user of the app
+    return Instance.from_file(loader)
+def sample_data(instance, n_train, n_few_shot, n_eval):
+    """
+    Re-sample the instance and render both subsets as text.
+
+    :param instance: object providing sample(), display(), train_samples and eval_samples
+    :param n_train: forwarded to instance.sample
+    :param n_few_shot: forwarded to instance.sample
+    :param n_eval: forwarded to instance.sample
+    :return: (rendered train samples, rendered eval samples, the same instance)
+    """
+    instance.sample(n_train, n_few_shot, n_eval)
+    rendered_train = instance.display(instance.train_samples)
+    rendered_eval = instance.display(instance.eval_samples)
+    return rendered_train, rendered_eval, instance

ape/data/README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+## 医疗指令集数据来源
+1. 文本分类：非标准化疾病诉求数据集 https://aistudio.baidu.com/aistudio/datasetdetail/104082
+2. 意图理解：医疗搜索意图识别挑战赛 https://aistudio.baidu.com/aistudio/datasetdetail/166530
+3. 问题生成：中医文献问题生成数据集 https://tianchi.aliyun.com/dataset/86895
+4. 实体抽取：中文医学文本实体关系抽取 https://www.biendata.xyz/competition/chip_2020_2/data/
+5. 信息抽取：面向中文电子病历的医疗实体及事件抽取 https://www.biendata.xyz/competition/ccks_2021_clinic/data/
+6. Paraphrase：临床术语标准化任务 http://openkg.cn/dataset/yidu-n7k
+7. QA：新冠知识图谱构建与问答评测 https://www.biendata.xyz/competition/ccks_2020_7_4/data/

ape/data/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # --coding:utf-8 --

ape/data/event_ie_train.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ape/data/intent_train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

ape/data/paraphase_train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

ape/instance.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# -*-coding:utf-8 -*-
+import json
+import random
+import pandas as pd
+class Instance(object):
+    """
+    Holds (input, output) samples and a random train / eval split of them.
+
+    By default few-shot samples are used for both generation and evaluation.
+    """
+    def __init__(self, loader=None):
+        """
+        :param loader: zero-argument callable returning the list of samples;
+            None creates an instance without samples
+        """
+        # The default used to be called unconditionally, which raised TypeError for None
+        self.samples = loader() if loader is not None else []
+        self.train_samples = []
+        self.eval_samples = []
+    def sample(self, n_train, n_few_shot, n_eval):
+        """
+        Draw disjoint random train and eval subsets from the loaded samples.
+
+        :param n_train: multiplied by n_few_shot to get the number of train samples
+            (presumably the number of few-shot groups - confirm with the caller)
+        :param n_few_shot: see n_train
+        :param n_eval: number of eval samples
+        :raises ValueError: if more samples are requested than were loaded
+        """
+        n_train = n_train * n_few_shot
+        if n_train + n_eval > len(self.samples):
+            raise ValueError(f'Train + Eval > total samples {len(self.samples)}, decrease them')
+        index = random.sample(range(len(self.samples)), n_train + n_eval)
+        train_index, eval_index = index[:n_train], index[n_train:]
+        self.train_samples = [self.samples[i] for i in train_index]
+        # Must use eval_index: building this from train_index made the eval set a copy of the train set
+        self.eval_samples = [self.samples[i] for i in eval_index]
+    @staticmethod
+    def display(samples):
+        """
+        Render samples as text, one ``input >> output`` line per sample.
+
+        :param samples: iterable of items indexable with [0] and [1]
+        :return: the rendered string ('' for no samples)
+        """
+        return ''.join(f'{i[0]} >> {i[1]}\n' for i in samples)
+    @classmethod
+    def from_file(cls, loader):
+        """
+        Build an Instance from a zero-argument loader callable.
+        """
+        return cls(loader)
+    @classmethod
+    def from_list(cls, tuple_list):
+        """
+        Build an Instance directly from a list of (input, output) pairs.
+        """
+        return cls(lambda: tuple_list)
+def load_event_extraction(file='./ape/data/event_ie_train.json'):
+    """
+    Read a file holding one JSON document per line.
+
+    :param file: path of the file
+    :return: list with the decoded object of every line, in file order
+    """
+    with open(file, 'rb') as handle:
+        return [json.loads(raw_line) for raw_line in handle.readlines()]
+def load_paraphase(file='./ape/data/paraphase_train.csv'):
+    """
+    Load pairs from a GBK-encoded, comma-separated CSV file.
+
+    :param file: path of the CSV file; its first row is read as the header
+    :return: list of (first column, second column) tuples, one per data row
+    """
+    df = pd.read_csv(file, encoding='GBK')
+    # Pick the two columns by position with iloc: row[0] on a label-indexed Series is
+    # deprecated in pandas, and zipping columns avoids building one Series per row
+    return list(zip(df.iloc[:, 0], df.iloc[:, 1]))
+def load_intent(file='./ape/data/intent_train.csv'):
+    """
+    Load pairs from a UTF-8 encoded, tab-separated file.
+
+    :param file: path of the file; its first row is read as the header
+    :return: list of (first column, second column) tuples, one per data row
+    """
+    df = pd.read_csv(file, encoding='UTF8', sep='\t')
+    # Pick the two columns by position with iloc: row[0] on a label-indexed Series is
+    # deprecated in pandas, and zipping columns avoids building one Series per row
+    return list(zip(df.iloc[:, 0], df.iloc[:, 1]))
+def upload_file(file):
+    """
+    Load (input, output) pairs from a text file with one pair per line.
+
+    Input and output are separated by a tab. A line without a tab is split at its
+    first space instead, which keeps the old space-separated files working.
+    Blank lines are skipped and the line ending is not part of the output.
+
+    :param file: path of the UTF-8 text file
+    :return: list of (input, output) tuples
+    :raises ValueError: if a non-blank line contains neither a tab nor a space
+    """
+    tuple_list = []
+    with open(file, 'r', encoding='utf-8') as f:
+        for line_no, line in enumerate(f, start=1):
+            line = line.rstrip('\r\n')
+            if not line.strip():
+                continue
+            sep = '\t' if '\t' in line else ' '
+            # maxsplit=1 so that separators inside the output do not break the unpacking
+            parts = line.split(sep, 1)
+            if len(parts) != 2:
+                raise ValueError(f'Line {line_no} is not "input<TAB>output": {line!r}')
+            tuple_list.append((parts[0], parts[1]))
+    return tuple_list
+# Maps a task name to the loader function that produces its samples
+LoadFactory = dict(
+    paraphase=load_paraphase,
+    event_extract=load_event_extraction,
+    search_intent=load_intent,
+    upload=upload_file,
+)
+if __name__ == '__main__':
+    # Manual smoke test: sample once from the paraphase loader and once from an in-memory list
+    n_train, few_shot, n_eval = 2, 3, 2
+    instance1 = Instance.from_file(load_paraphase)
+    instance1.sample(n_train, few_shot, n_eval)
+    print(instance1.display(instance1.train_samples))
+    antonym_pairs = [
+        ('sane', 'insane'), ('direct', 'indirect'), ('informally', 'formally'),
+        ('unpopular', 'popular'), ('subtractive', 'additive'),
+        ('nonresidential', 'residential'), ('inexact', 'exact'),
+        ('uptown', 'downtown'), ('incomparable', 'comparable'),
+        ('powerful', 'powerless'), ('gaseous', 'solid'),
+        ('evenly', 'unevenly'), ('formality', 'informality'),
+        ('deliberately', 'accidentally'), ('off', 'on'),
+    ]
+    instance2 = Instance.from_list(antonym_pairs)
+    instance2.sample(n_train, few_shot, n_eval)
+    print(instance2.display(instance2.train_samples))

ape/llm.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# -*-coding:utf-8 -*-
+from tqdm import tqdm
+import tiktoken
+from ape.prompt import MyTemplate
+from langchain.chat_models import ChatOpenAI
+from langchain.llms import OpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+)
+from langchain.prompts import PromptTemplate
+from langchain.chains.llm import LLMChain
+# Price table keyed by model name
+# NOTE(review): presumably USD per 1K tokens - confirm against the current OpenAI pricing
+Cost = dict(davinci=0.02, chatgpt=0.004)
+class LLMGPT(object):
+    """
+    Bundles the two OpenAI models of the APE flow: a chat model that generates
+    instructions and a completion model (created with echo=True) for evaluating them.
+    """
+    def __init__(self, openai_key, max_tokens, n_instruct, loader=None):
+        """
+        :param openai_key: OpenAI API key handed to both models
+        :param max_tokens: max_tokens handed to both models
+        :param n_instruct: handed to the chat model as ``n``
+        :param loader: optional zero-argument callable returning the samples;
+            None leaves the sample list empty
+        """
+        self.gen_llm = ChatOpenAI(openai_api_key=openai_key, max_tokens=max_tokens, temperature=0.7, n=n_instruct)
+        self.eval_llm = OpenAI(openai_api_key=openai_key, max_tokens=max_tokens, temperature=0.7, echo=True)
+        self.gen_instruct_chain = None
+        # Old attribute name, kept as an alias of gen_instruct_chain
+        self.gen_chain = None
+        self.eval_chain = None
+        # The default used to be called unconditionally, which raised TypeError for None
+        self.samples = loader() if loader is not None else []
+        self.init()
+    @staticmethod
+    def confirm_cost(prompt, mode):
+        """
+        Estimate the price of a prompt: cl100k_base token count / 1000 * rate.
+
+        :param prompt: text whose tokens are counted
+        :param mode: 'train' selects the rate 0.02, anything else 0.0004
+        :return: estimated price as a float
+        """
+        # NOTE(review): these rates are hard-coded and 0.0004 differs from Cost['chatgpt'] (0.004) - confirm
+        if mode == 'train':
+            cost = 0.02
+        else:
+            cost = 0.0004
+        encoding = tiktoken.get_encoding("cl100k_base")
+        num_tokens = len(encoding.encode(prompt))
+        total_price = ((num_tokens / 1000) * cost)
+        return total_price
+    def init(self):
+        """
+        Create the generation chain (system + user chat prompt) and the evaluation chain.
+        """
+        prompt = ChatPromptTemplate.from_messages(
+            [
+                SystemMessagePromptTemplate.from_template(MyTemplate['gen_sys_prompt']),
+                HumanMessagePromptTemplate.from_template(MyTemplate['gen_user_prompt']),
+            ]
+        )
+        self.gen_instruct_chain = LLMChain(llm=self.gen_llm, prompt=prompt)
+        self.gen_chain = self.gen_instruct_chain
+        prompt = PromptTemplate.from_template(MyTemplate['eval_prompt'])
+        self.eval_chain = LLMChain(llm=self.eval_llm, prompt=prompt)
+    def generate_instruction(self, few_shot):
+        """
+        Generate instruction
+
+        :param few_shot: iterable of (input, output) pairs shown to the model
+        :return: the result of LLMChain.generate
+        """
+        # few_shot_prompt in ape/prompt.py has the named fields {input} and {output}, so the
+        # values are passed by keyword; examples are put on separate lines
+        prompt = '\n'.join(
+            MyTemplate['few_shot'].format(input=shot[0], output=shot[1]) for shot in few_shot
+        )
+        print(prompt)
+        # LLMChain.generate expects a list of input dicts keyed by the prompt variables,
+        # here the single variable of gen_user_prompt
+        result = self.gen_instruct_chain.generate([{'few_shot': prompt}])
+        return result
+    def generate_logprobs(self):
+        """
+        Eval instruction
+
+        TODO: not implemented yet, the method has no body and returns None.
+        """

ape/prompt.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# -*-coding:utf-8 -*-
+from langchain import PromptTemplate
+# Template of one few-shot example; fill it with str.format(input=..., output=...)
+few_shot_prompt = "Input: {input}\nOutput: {output}"
+# The user message is just the rendered few-shot examples
+gen_user_prompt = '{few_shot}'
+# System message for instruction generation. Doubled braces become literal braces when the
+# template is formatted. The example uses double quotes so that it is valid JSON itself,
+# as the last sentence of the prompt requires.
+gen_sys_prompt = """
+                I want you to act as an AI assisted doctor. You are capable of answering anything related to medical. Given
+                a list of input-output pairs, you must come up with the correct instruction in medical-related area.
+                You must respond in the following format, and always respond in chinese.
+                ```
+                {{"instruction": "$YOUR_INSTRUCTION"}}
+                ```
+                Everything between the ``` must be valid json.
+                """
+# Template used to score an instruction against one (input, output) pair
+eval_prompt = "Instruction: {prompt}\nInput: {input}\nOutput: {output}"
+MyTemplate = {
+    # Read by ape/llm.py when rendering the few-shot examples
+    'few_shot': few_shot_prompt,
+    'gen_user_prompt': gen_user_prompt,
+    # Key that ape/llm.py and app.py look up
+    'gen_sys_prompt': gen_sys_prompt,
+    # Earlier spelling of the same entry, kept for backward compatibility
+    'gen_system_prompt': gen_sys_prompt,
+    'eval_prompt': eval_prompt
+}

app.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# -*-coding:utf-8 -*-
+import os
+import gradio as gr
+from ape.instance import LoadFactory
+from ape.prompt import MyTemplate
+from ape.ape import *
+# Layout of the APE demo: API key, sampling configuration, dataset loading and result tabs
+with gr.Blocks(title="Automatic Prompt Engineer", css=None) as demo:
+    gr.Markdown("# Automatic Prompt Engineer")
+    openai_key = gr.Textbox(type='password', label='输入 API key')
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.Markdown("## Configuration")
+            with gr.Row():
+                n_train = gr.Slider(label="Number of Train", minimum=1, maximum=20, step=1, value=5)
+                n_few_shot = gr.Slider(label="Number of FewShot", minimum=1, maximum=20, step=1, value=5)
+            with gr.Row():
+                n_eval = gr.Slider(label="Number of Eval", minimum=5, maximum=30, step=5, value=20)
+                n_instruct = gr.Slider(label="Number of Prompt", minimum=1, maximum=5, step=1, value=2)
+        with gr.Column(scale=3):
+            gr.Markdown("## 加载数据集")
+            with gr.Tab("Choose Dataset"):
+                with gr.Row():
+                    # file_types takes extensions (with the dot) or type names; 'txt' is neither
+                    file = gr.File(label='上传txt文件，input\toutput\n', file_types=['.txt'])
+                with gr.Row():
+                    # A plain list instead of a dict_keys view
+                    task = gr.Dropdown(label="Chosse Existing Task", choices=list(LoadFactory.keys()), value=None)
+                with gr.Row():
+                    # Per-session holder of the loaded Instance
+                    instance = gr.State()
+                    load_button = gr.Button("Load Task")
+                    sample_button = gr.Button('sample Data')
+            with gr.Tab("Display Sampled Dataset"):
+                with gr.Row():
+                    train_str = gr.Textbox(max_lines=100, lines=10, label="Data for prompt generation")
+                    eval_str = gr.Textbox(max_lines=100, lines=10, label="Data for scoring")
+    with gr.Row():
+        gr.Markdown("## Run APE")
+        with gr.Column(scale=2):
+            with gr.Row():
+                gr.Markdown('1. Generate Prompt')
+                gr.Markdown(MyTemplate['gen_sys_prompt'])
+                gr.Markdown('2. Evaluate Prompt')
+                gr.Markdown(MyTemplate['eval_prompt'])
+            with gr.Row():
+                # Read-only fields use interactive=False; Textbox has no 'disabled' parameter
+                basic_cost = gr.Textbox(lines=1, value="", label="Estimated Cost ($)", interactive=False)
+                basic_cost_button = gr.Button("Estimate Cost")
+                basic_ape_button = gr.Button("Run APE")
+        with gr.Column(scale=3):
+            with gr.Tab("APE Results"):
+                # Display all generated prompt with log probs
+                output_df = gr.DataFrame(type='pandas', headers=['Prompt', 'Likelihood'], wrap=True, interactive=False)
+            with gr.Tab("Prompt Overview"):
+                with gr.Row():
+                    generation_prompt_sample = gr.Textbox(lines=8, value="",
+                                                          label="Instruction Generation Prompts",
+                                                          interactive=False)
+                    evaluation_prompt_sample = gr.Textbox(lines=8, value="",
+                                                          label="Evaluation Prompts",
+                                                          interactive=False)
+            with gr.Tab("Test Prompt"):
+                # Test the output of LLM using prompt
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        test_prompt = gr.Textbox(lines=4, value="",
+                                                 label="Prompt to test")
+                        test_inputs = gr.Textbox(lines=1, value="",
+                                                 label="Input used to test prompt")
+                        answer_button = gr.Button("Test")
+                    with gr.Column(scale=1):
+                        test_output = gr.Textbox(lines=9, value="", label="Model Output")
+            with gr.Tab("Eval Prompt"):
+                # By Default use the Evaluation Set in APE
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        score_prompt = gr.Textbox(lines=3, value="",
+                                                  label="Prompt to Evaluate")
+                        compute_score_button = gr.Button("Evaluate")
+                    with gr.Column(scale=1):
+                        test_score = gr.Textbox(lines=1, value="", label="Log(p)", interactive=False)
+    # Callbacks
+    # 1. Pick an existing task or upload a file, then build the Instance
+    load_button.click(load_task, [task, file], [instance])
+    # 2. Sample train and eval data according to the Configuration sliders and show them in the
+    #    UI; clicking again re-samples
+    sample_button.click(sample_data, [instance, n_train, n_few_shot, n_eval], [train_str, eval_str, instance])
+    # 3. Estimate Cost for train + Eval
+    # 4. Run APE -> all instructions together with the log prob of each
+    # 5. Test a single instruction
+    # 6. Score a hand-written instruction

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+tiktoken
+openai
+langchain
+pandas