Spaces:
Running
Running
File size: 5,316 Bytes
0db3dd6 a7d425a 0db3dd6 85cbdde 0db3dd6 20c8875 a6cc512 11fdf65 0db3dd6 13a67fb 0db3dd6 4d19dd8 0db3dd6 b7f85ba 0db3dd6 b7f85ba 0db3dd6 4d19dd8 0db3dd6 edb744b 0db3dd6 4d19dd8 0db3dd6 4d19dd8 0db3dd6 16c293e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import gradio as gr
from llmlingua import PromptCompressor
# Single PromptCompressor instance, created once at import time and reused by
# compress_prompt(); configured with the "lgaalves/gpt2-dolly" model name and
# device_map="cpu".
llm_lingua = PromptCompressor(
    "lgaalves/gpt2-dolly",
    device_map="cpu",
)
# Markdown text for the top of the page; it is passed to gr.Markdown(INTRO)
# when the interface is built. The component bullets are indented so they nest
# under step 1 of the numbered list.
INTRO = """
# LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models
This is an early demo of the prompt compression method LLMLingua.
It should be noted that due to limited resources, we only provide the **GPT2-Small** size language model in this demo. Using the **LLaMA2-7B** as a small language model would result in a significant performance improvement, especially at high compression ratios.
To use it, upload your prompt and set the compression target.
1. ✅ Set the different components of the prompt separately, including instruction, context, and question. Leave the corresponding field empty if a particular component does not exist.
    - Question: This refers to the directives given by the user to the LLMs, such as inquiries, questions, or requests. Positioned after the instruction and context modules, the question module has a high sensitivity to compression.
    - Context: This module provides the supplementary context needed to address the question, such as documents, demonstrations, web search results, or API call results. Located between the instruction and question modules, its sensitivity to compression is relatively low.
    - Instruction: This module consists of directives given by the user to the LLMs, such as task descriptions. Placed before the context and question modules, the instruction module exhibits a high sensitivity to compression.
2. ✅ Set the target_token or compression ratio.
3. 🤔 Try experimenting with different target compression ratios or other hyperparameters to optimize the performance.
You can check our [repo](https://aka.ms/LLMLingua)!
"""
# Extra stylesheet handed to gr.Blocks(css=custom_css) when the page is built.
# - The "#params ..." rules target descendants of the column created with
#   elem_id="params" and make its tabs/forms flex containers that grow to fill
#   the available space.
# - The ".md ..." rules add a left margin to lists and a bottom margin to
#   images inside elements carrying the "md" class (presumably Gradio's
#   rendered-Markdown wrapper — confirm against the Gradio version in use).
# NOTE(review): no component in the visible code uses elem_id="image-upload",
# so the first rule looks like a leftover from another demo — confirm before
# removing it.
custom_css = """
#image-upload {
flex-grow: 1;
}
#params .tabs {
display: flex;
flex-direction: column;
flex-grow: 1;
}
#params .tabitem[style="display: block;"] {
flex-grow: 1;
display: flex !important;
}
#params .gap {
flex-grow: 1;
}
#params .form {
flex-grow: 1 !important;
}
#params .form > :last-child{
flex-grow: 1;
}
.md ol, .md ul {
margin-left: 1rem;
}
.md img {
margin-bottom: 1rem;
}
"""
def compress_prompt(context, instruction, question, ratio, target_token):
    """Compress a three-part prompt and return the values shown in the UI.

    Each text argument first has every literal backslash-n sequence replaced
    by a real newline. The context is then split on blank lines into a list
    of chunks, and everything is forwarded positionally to the module-level
    ``llm_lingua.compress_prompt``.

    Args:
        context: Context text; split into chunks on double newlines.
        instruction: Instruction text.
        question: Question text.
        ratio: Compression ratio; converted with ``float()``.
        target_token: Target token count; converted with ``float()``.

    Returns:
        A list with the result's "compressed_prompt", "origin_tokens",
        "compressed_tokens", "ratio" and "saving" entries, in that order.

    Raises:
        ValueError: If ``ratio`` or ``target_token`` cannot be parsed by
            ``float()``.
    """
    # Turn typed-out "\n" escape sequences into actual line breaks.
    context, instruction, question = (
        part.replace("\\n", "\n") for part in (context, instruction, question)
    )

    result = llm_lingua.compress_prompt(
        context.split("\n\n"),
        instruction,
        question,
        float(ratio),
        float(target_token),
    )

    # Order matters: it must line up with the `outputs` list of the click handler.
    output_keys = (
        "compressed_prompt",
        "origin_tokens",
        "compressed_tokens",
        "ratio",
        "saving",
    )
    return [result[name] for name in output_keys]
# Page layout: intro text, an input row (prompt fields plus compression
# settings), a results row, and the wiring of the compress button.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# that had lost its indentation — in particular, confirm that the button
# belongs inside the "params" column rather than its tab or the row.
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(INTRO)

    # Input row: one column for the prompt parts, one for the compression target.
    with gr.Row():
        with gr.Column(elem_id="prompt", scale=2):
            with gr.Tab("Prompts"):
                instruction = gr.Textbox(label="Instruction", lines=1, value="")
                context = gr.Textbox(label="Context", lines=3, value="")
                question = gr.Textbox(label="Question", lines=1, value="")

        with gr.Column(elem_id="params", scale=1):
            with gr.Tab("Compression Target"):
                target_token = gr.Textbox(
                    label="Target Token (To use this, set Compression Ratio to 0)",
                    value=200,
                )
                ratio = gr.Textbox(
                    label="Compression Ratio (To use this, set Target Token to -1)",
                    value=0,
                )

            gen_button = gr.Button(value="Compress Prompt!", variant="primary")

    # Results row: the compressed text next to the token/cost statistics.
    with gr.Row():
        with gr.Column(elem_id="Results", scale=2):
            with gr.Tab("Compressed Prompts"):
                compressed_prompt = gr.Textbox(label="compressed_prompt", lines=5)

        with gr.Column(elem_id="Results_2", scale=1):
            with gr.Tab("Saving"):
                origin_tokens = gr.Textbox(label="The tokens number of original prompt")
                compressed_tokens = gr.Textbox(label="The tokens number of compressed prompt")
                saving_ratio = gr.Textbox(label="Actual Compression Ratio")
                saving = gr.Textbox(label="Saving Cost")

    # `inputs` follows the parameter order of compress_prompt(); `outputs`
    # follows the order of the list it returns.
    gen_button.click(
        fn=compress_prompt,
        inputs=[context, instruction, question, ratio, target_token],
        outputs=[
            compressed_prompt,
            origin_tokens,
            compressed_tokens,
            saving_ratio,
            saving,
        ],
    )
# Configure the request queue (max_size=10, api_open=False) and start the app
# with show_api=False. The stray trailing "|" that followed this call was
# removed: it left a dangling binary operator, which is a SyntaxError.
iface.queue(max_size=10, api_open=False).launch(show_api=False)