File size: 5,016 Bytes
0db3dd6
 
 
a7d425a
0db3dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20c8875
a6cc512
11fdf65
0db3dd6
13a67fb
0db3dd6
 
 
 
 
 
 
 
 
 
b7f85ba
0db3dd6
 
 
 
 
 
 
 
 
b7f85ba
0db3dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16c293e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
from llmlingua import PromptCompressor

# Shared compressor instance, created once at import time and reused by every
# request handled by `compress_prompt`. It is pinned to the CPU via device_map.
_COMPRESSOR_MODEL = "lgaalves/gpt2-dolly"
llm_lingua = PromptCompressor(_COMPRESSOR_MODEL, device_map="cpu")

# Markdown shown at the top of the demo page. It describes the three prompt
# components in the order they appear in a prompt (instruction, context,
# question) and how to choose a compression target.
INTRO = """
# LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models
This is an early demo of the prompt compression method LLMLingua.
To use it, enter your prompt and set the compression target.
1. ✅ Set the different components of the prompt separately, including instruction, context, and question. Leave the corresponding field empty if a particular component does not exist.
    - Instruction: This module consists of directives given by the user to the LLMs, such as task descriptions. Placed before the context and question modules, the instruction module exhibits a high sensitivity to compression.
    - Context: This module provides the supplementary context needed to address the question, such as documents, demonstrations, web search results, or API call results. Located between the instruction and question modules, its sensitivity to compression is relatively low.
    - Question: This refers to the directives given by the user to the LLMs, such as inquiries, questions, or requests. Positioned after the instruction and context modules, the question module has a high sensitivity to compression.
2. ✅ Set the target_token or compression ratio.
3. 🤔 Try experimenting with different target compression ratios or other hyperparameters to optimize the performance.

You can check our [repo](https://aka.ms/LLMLingua)!
"""

# Extra CSS handed to `gr.Blocks(css=...)`. The `#params` rules make the
# column with elem_id="params" and its tabs/forms grow to fill the available
# height; the `.md` rules adjust list and image spacing.
# NOTE(review): `#image-upload` does not match any elem_id used in this file;
# it looks like a leftover from another demo - confirm before removing.
# NOTE(review): `.tabs`, `.tabitem`, `.gap`, `.form` and `.md` are presumably
# class names emitted by Gradio itself - verify against the Gradio version in use.
custom_css = """
    #image-upload {
        flex-grow: 1;
    }
    #params .tabs {
        display: flex;
        flex-direction: column;
        flex-grow: 1;
    }
    #params .tabitem[style="display: block;"] {
        flex-grow: 1;
        display: flex !important;
    }
    #params .gap {
        flex-grow: 1;
    }
    #params .form {
        flex-grow: 1 !important;
    }
    #params .form > :last-child{
        flex-grow: 1;
    }
    .md ol, .md ul {
        margin-left: 1rem;
    }
    .md img {
        margin-bottom: 1rem;
    }
"""

def compress_prompt(context, instruction, question, ratio, target_token):
    """Compress a prompt with the shared LLMLingua compressor.

    Literal ``\\n`` sequences (a backslash followed by ``n``) in the three
    text inputs are turned into real newlines, and the context is then split
    into separate documents on blank lines.

    Args:
        context: Context text; chunks are separated by an empty line.
        instruction: Instruction text placed before the context.
        question: Question text placed after the context.
        ratio: Compression ratio, anything ``float()`` accepts.
        target_token: Target token count, anything ``float()`` accepts.
            NOTE(review): -1 presumably means "no explicit token target" -
            confirm against the LLMLingua documentation.

    Returns:
        A list ``[compressed_prompt, origin_tokens, compressed_tokens, ratio,
        saving]`` taken from the compressor's result mapping, in that order.

    Raises:
        ValueError: If ``ratio`` or ``target_token`` cannot be parsed by
            ``float()``.
    """

    def _unescape(text):
        # Users type "\n" into a single-line box; make it an actual newline.
        return text.replace("\\n", "\n")

    context = _unescape(context)
    instruction = _unescape(instruction)
    question = _unescape(question)

    documents = context.split("\n\n")
    result = llm_lingua.compress_prompt(
        documents,
        instruction,
        question,
        float(ratio),
        float(target_token),
    )

    # Order must match the `outputs` list wired to the button click handler.
    output_keys = (
        "compressed_prompt",
        "origin_tokens",
        "compressed_tokens",
        "ratio",
        "saving",
    )
    return [result[field] for field in output_keys]


# Build the demo page: intro text, an input column (prompt parts and
# compression target), the action button, and a results column.
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(INTRO)

    # Inputs. elem_id="params" is the hook targeted by the custom CSS.
    with gr.Row(), gr.Column(elem_id="params"):
        with gr.Tab("Prompts"):
            instruction = gr.Textbox(label="Instruction", lines=1, value="")
            context = gr.Textbox(label="Context", lines=3, value="")
            question = gr.Textbox(label="Question", lines=1, value="")
        with gr.Tab("Compression Target"):
            # Plain text boxes; the handler converts both values with float().
            ratio = gr.Textbox(label="Compression Ratio", value=0.5)
            target_token = gr.Textbox(label="Target Token", value=-1)

    gen_button = gr.Button(value="Compress Prompt!", variant="primary")

    # Outputs: the compressed text plus token and cost statistics.
    with gr.Row(), gr.Column(elem_id="Results"):
        with gr.Tab("Compressed Prompts"):
            compressed_prompt = gr.Textbox(label="compressed_prompt", lines=5)
        with gr.Tab("Saving"):
            origin_tokens = gr.Textbox(label="The tokens number of original prompt")
            compressed_tokens = gr.Textbox(label="The tokens number of compressed prompt")
            saving_ratio = gr.Textbox(label="Actual Compression Ratio")
            saving = gr.Textbox(label="Saving Cost")

    # Input order matches compress_prompt's parameters; output order matches
    # the list it returns.
    gen_button.click(
        fn=compress_prompt,
        inputs=[context, instruction, question, ratio, target_token],
        outputs=[compressed_prompt, origin_tokens, compressed_tokens, saving_ratio, saving],
    )

# Enable the request queue (at most 10 pending jobs) and start the server
# with the API surface hidden.
iface.queue(max_size=10, api_open=False).launch(show_api=False)