File size: 8,211 Bytes
0db3dd6
 
 
a7d425a
0db3dd6
 
 
 
85cbdde
 
 
0db3dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd02cc7
 
ba5df81
bd02cc7
 
ba5df81
bd02cc7
 
 
 
 
ba5df81
bd02cc7
 
 
 
 
ba5df81
bd02cc7
 
 
20c8875
a6cc512
11fdf65
0db3dd6
13a67fb
0db3dd6
 
 
 
 
 
4d19dd8
0db3dd6
 
 
b7f85ba
0db3dd6
ba5df81
0db3dd6
 
 
 
 
ba5df81
0db3dd6
 
 
b7f85ba
0db3dd6
ba5df81
0db3dd6
4d19dd8
0db3dd6
 
edb744b
 
 
 
 
 
0db3dd6
 
 
 
 
0ca26f4
0db3dd6
 
 
ba5df81
0db3dd6
4d19dd8
0db3dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ca26f4
 
 
 
ba5df81
0ca26f4
 
 
ba5df81
 
0ca26f4
0db3dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16c293e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import gradio as gr
from llmlingua import PromptCompressor

# Small language model used by LLMLingua to score token importance for
# compression.  GPT2-small-sized "gpt2-dolly" is used here only because of the
# demo's limited resources (see INTRO); a larger SLM such as LLaMA2-7B would
# compress noticeably better.  Runs on CPU.
llm_lingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu")

# Markdown shown at the top of the demo page: title, caveats about the
# small-model setup, and usage steps.  Rendered verbatim by gr.Markdown below.
INTRO = """
# LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models
This is an early demo of the prompt compression method LLMLingua.

It should be noted that due to limited resources, we only provide the **GPT2-Small** size language model in this demo. Using the **LLaMA2-7B** as a small language model would result in a significant performance improvement, especially at high compression ratios.

To use it, upload your prompt and set the compression target.
1. ✅ Set the different components of the prompt separately, including instruction, context, and question. Leave the corresponding field empty if a particular component does not exist.
    - Question: This refers to the directives given by the user to the LLMs, such as inquiries, questions, or requests. Positioned after the instruction and context modules, the question module has a high sensitivity to compression.
    - Context: This module provides the supplementary context needed to address the question, such as documents, demonstrations, web search results, or API call results. Located between the instruction and question modules, its sensitivity to compression is relatively low.
    - Instruction: This module consists of directives given by the user to the LLMs, such as task descriptions. Placed before the instruction and context modules, the instruction module exhibits a high sensitivity to compression.
2. ✅ Set the target_token or compression ratio.
3. 🤔 Try experimenting with different target compression ratios or other hyperparameters to optimize the performance.

You can check our [repo](https://aka.ms/LLMLingua)!
"""

# CSS injected into the Gradio page (gr.Blocks(css=...)): stretches the
# parameter tab/form to fill its column and tweaks markdown list/image margins.
custom_css = """
    #image-upload {
        flex-grow: 1;
    }
    #params .tabs {
        display: flex;
        flex-direction: column;
        flex-grow: 1;
    }
    #params .tabitem[style="display: block;"] {
        flex-grow: 1;
        display: flex !important;
    }
    #params .gap {
        flex-grow: 1;
    }
    #params .form {
        flex-grow: 1 !important;
    }
    #params .form > :last-child{
        flex-grow: 1;
    }
    .md ol, .md ul {
        margin-left: 1rem;
    }
    .md img {
        margin-bottom: 1rem;
    }
"""

# Rows for the example table rendered at the bottom of the page.  Column order
# matches the gr.Dataframe headers below:
#   [small language model, compression ratio, GSM8K accuracy w/ GPT-3.5-Turbo,
#    compressed prompt text].
# The prompt text is shown as-is (already compressed), not re-compressed.
EXAMPLES = [
    [
        "lgaalves/gpt2-dolly",
        "8.7x",
        "",
        "Question: can buy 4 1melon for You bought 36 fruits evenly split between of 1 $. does cost if bill $\n's think step\nIf between 3 then I 363 = 12 of fruit 1 orange then oranges506If I oranges I $66 $60 on the 2 fruit\n the of is, and that price and is 1W4AIf we know we bought 12 and 12W\n thatW can the 12 = 48\n of apple (60/The 1\n: Sam a dozen boxes with 30ighter pens each Heanged into six3 the separately of three. much in\n's boxes $120 12 =Sam then took 5 boxes × 6 highlighters/box = 30 highlighters.\nHe sold these boxes for 5 * $3 = $15\nAfter selling these 5 boxes there were 360 - 30 = 330 highlighters remaining.\nThese form 330 / 3 = 110 groups of three pens.\nHe sold each of these groups for $2 each, so made 110 * 2 = $220 from them.\nIn total, then, he earned $220 + $15 = $235.\nSince his original cost was $120, he earned $235 - $120 = $115 in profit.\nThe answer is 115",
    ],
    [
        "vicgalle/alpaca-7b",
        "13.8x",
        "78.32",
        "Question: Sam bought a dozen boxes, each 30 highl pens inside, $10 each. He reanged five of boxes into of sixlters each sold $3. He sold the theters separately at the of three $2. How much did make in total, in\nLets think step\nSam bought  boxes x0 = $10 oflters.\nHe 2 300ters in\nSam then 5 boxes 6ters0ters\nHe sold these boxes for 55\nAfterelling these  boxes there300lters remaining\nThese form 330 310 of three pens\nHe sold each of these groups for2 each, so made 0 *0 from\nIn total, he $ $155\nSince his original $1, he earned $20 = $115 in profit.\nThe answer is 115\n\n",
    ],
    [
        "vicgalle/alpaca-7b",
        "20.2x",
        "77.94",
        "Question: Sam bought a dozen boxes, each with 30 highl pens inside, for $10 each.\nHe reanged five of boxes into of sixlters each sold them $3 per package.\nHe sold the rest of thelters separately at the of three pens for $2.\nHow much profit did make in total, in dollars\nLet's think step by step\nSam then took 5 boxes × 6lighters/box = 30 highlighters.\nThese form 330 / 3 = 110 groups of three pens.\nThe answer is 115\n\n",
    ],
]

def compress_prompt(context, instruction, question, ratio, target_token):
    """Compress the prompt components with LLMLingua and report savings.

    Args:
        context: supplementary material (documents, demonstrations, ...);
            blank-line-separated chunks are treated as separate segments.
        instruction: task description placed before the context.
        question: the user's actual request, placed after the context.
        ratio: compression ratio as entered in the textbox (0 disables it).
        target_token: token budget as entered in the textbox (-1 disables it).

    Returns:
        A list in the order expected by the output widgets:
        [compressed_prompt, origin_tokens, compressed_tokens, ratio, saving].
    """
    def _unescape(text):
        # The textboxes deliver a typed backslash-n as the two characters
        # "\n"; convert it to a real newline before compression.
        return text.replace("\\n", "\n")

    context = _unescape(context)
    instruction = _unescape(instruction)
    question = _unescape(question)

    # Robustness: a cleared textbox yields "", which float() would reject
    # with a ValueError.  Fall back to the "disabled" sentinels documented
    # on the widget labels (ratio=0, target_token=-1).
    ratio = float(ratio) if str(ratio).strip() else 0.0
    target_token = float(target_token) if str(target_token).strip() else -1.0

    result = llm_lingua.compress_prompt(
        context.split("\n\n"),  # one list entry per blank-line-separated chunk
        instruction,
        question,
        ratio,
        target_token,
    )
    return [
        result[key]
        for key in (
            "compressed_prompt",
            "origin_tokens",
            "compressed_tokens",
            "ratio",
            "saving",
        )
    ]


# Build the demo UI.  All widget variables assigned here (instruction, context,
# question, target_token, ratio, the result textboxes) are wired together by
# gen_button.click at the bottom.
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(INTRO)

    # --- Inputs: the three prompt components and the compression target -----
    with gr.Row():
        with gr.Column(elem_id="prompt", scale=2):
            with gr.Tab('Prompts'):
                instruction = gr.Textbox(
                    label="Instruction",
                    lines=1,
                    value="",
                    placeholder="This module consists of directives given by the user to the LLMs, such as task descriptions.",
                )
                context = gr.Textbox(
                    label="Context",
                    lines=3,
                    value="",
                    placeholder="This module provides the supplementary context needed to address the question, such as documents, demonstrations, web search results, or API call results.",
                )
                question = gr.Textbox(
                    label="Question",
                    lines=1,
                    value="",
                    placeholder="This refers to the directives given by the user to the LLMs, such as inquiries, questions, or requests.",
                )
        with gr.Column(elem_id="params", scale=1):
            with gr.Tab('Compression Target'):
                # Exactly one of the two targets should be active; the labels
                # tell the user which sentinel disables the other.
                target_token = gr.Textbox(
                    label="Target Token (To use this, set Compression Ratio to 0)",
                    value=200,
                )
                ratio = gr.Textbox(
                    label="Compression Ratio (To use this, set Target Token to -1)",
                    value=0,
                )

    gen_button = gr.Button(value="Compress Prompt!", variant="primary")

    # --- Outputs: compressed prompt plus token/cost statistics --------------
    with gr.Row():
        with gr.Column(elem_id="Results", scale=1):
            with gr.Tab('Compressed Prompts'):
                compressed_prompt = gr.Textbox(
                    label="compressed_prompt",
                    lines=10,
                )
        with gr.Column(elem_id="Results_2", scale=1):
            with gr.Tab('Saving'):
                origin_tokens = gr.Textbox(
                    label="The tokens number of original prompt",
                )
                compressed_tokens = gr.Textbox(
                    label="The tokens number of compressed prompt",
                )
                saving_ratio = gr.Textbox(
                    label="Actual Compression Ratio",
                )
                saving = gr.Textbox(
                    label="Saving Cost",
                )

    # --- Static examples table (read-only, not fed back into the model) -----
    gr.Markdown("## Examples in GSM8K")

    gr.Dataframe(
        value=EXAMPLES,
        headers=["Small Language Model", "Compression Ratio", "GSM8K Acc using GPT-3.5-Turbo", "Compressed Prompts",],
        datatype=["str", "str", "str", "str"],
    )

    # Wire the button: input widget order must match compress_prompt's
    # parameters, output order must match its returned list.
    gen_button.click(
        fn=compress_prompt,
        inputs=[
            context,
            instruction,
            question,
            ratio,
            target_token
        ],
        outputs=[
            compressed_prompt,
            origin_tokens,
            compressed_tokens,
            saving_ratio,
            saving
        ],
    )

iface.queue(max_size=10, api_open=False).launch(show_api=False)