DongfuJiang committed on
Commit
bf79ee8
1 Parent(s): fcac78f
Files changed (2)
  1. app.py +410 -233
  2. descriptions.py +49 -0
app.py CHANGED
@@ -1,10 +1,14 @@
import gradio as gr
import sys
import os
- import zipfile
+ import random
+ import llm_blender
+ import descriptions
from datasets import load_dataset
+ from llm_blender.blender.blender_utils import get_topk_candidates_from_ranks
from typing import List

+
MAX_BASE_LLM_NUM = 20
MIN_BASE_LLM_NUM = 3
  SOURCE_MAX_LENGTH = 256
@@ -13,10 +17,6 @@ CANDIDATE_MAX_LENGTH = 256
DEFAULT_CANDIDATE_MAX_LENGTH = 128
FUSER_MAX_NEW_TOKENS = 512
DEFAULT_FUSER_MAX_NEW_TOKENS = 256
- DESCRIPTIONS = """# LLM-BLENDER
-
- LLM-Blender is an innovative ensembling framework to attain consistently superior performance by leveraging the diverse strengths of multiple open-source large language models (LLMs). LLM-Blender cut the weaknesses through ranking and integrate the strengths through fusing generation to enhance the capability of LLMs.
- """
EXAMPLES_DATASET = load_dataset("llm-blender/mix-instruct", split='validation', streaming=True)
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42, buffer_size=1000)
  EXAMPLES = []
@@ -28,47 +28,75 @@ for example in SHUFFLED_EXAMPLES_DATASET.take(100):
    ])
    CANDIDATE_EXAMPLES[example['instruction']+example['input']] = example['candidates']

- # Download ranker checkpoint
- if not os.path.exists("pairranker-deberta-v3-large.zip"):
-     os.system("gdown https://drive.google.com/uc?id=1EpvFu_qYY0MaIu0BAAhK-sYKHVWtccWg")
- if not os.path.exists("pairranker-deberta-v3-large"):
-     with zipfile.ZipFile("pairranker-deberta-v3-large.zip", 'r') as zip_ref:
-         zip_ref.extractall(".")
+ HHH_EXAMPLES = []
+ subsets = ['harmless', 'helpful', 'honest', 'other']
+ random.seed(42)
+ for subset in subsets:
+     dataset = load_dataset("HuggingFaceH4/hhh_alignment", subset)
+     for example in dataset['test']:
+         if random.random() < 0.5:
+             HHH_EXAMPLES.append([
+                 subset,
+                 example['input'],
+                 example['targets']['choices'][0],
+                 example['targets']['choices'][1],
+                 "Response 1" if example['targets']['labels'][0] == 1 else "Response 2",
+             ])
+         else:
+             HHH_EXAMPLES.append([
+                 subset,
+                 example['input'],
+                 example['targets']['choices'][1],
+                 example['targets']['choices'][0],
+                 "Response 2" if example['targets']['labels'][0] == 1 else "Response 1",
+             ])
+ def get_hhh_examples(subset, instruction, response1, response2, dummy_text):
+     return instruction, response1, response2

- # Load Blender
- import llm_blender
- from llm_blender.blender.blender_utils import get_topk_candidates_from_ranks
- ranker_config = llm_blender.RankerConfig()
- ranker_config.ranker_type = "pairranker"
- ranker_config.model_type = "deberta"
- ranker_config.model_name = "microsoft/deberta-v3-large" # ranker backbone
- ranker_config.load_checkpoint = "./pairranker-deberta-v3-large" # ranker checkpoint <your checkpoint path>
- ranker_config.source_maxlength = DEFAULT_SOURCE_MAX_LENGTH
- ranker_config.candidate_maxlength = DEFAULT_CANDIDATE_MAX_LENGTH
- ranker_config.n_tasks = 1 # number of signals used to train the ranker. This checkpoint is trained using BARTScore only, thus 1.
- fuser_config = llm_blender.GenFuserConfig()
- fuser_config.model_name = "llm-blender/gen_fuser_3b" # our pre-trained fuser
- fuser_config.max_length = 1024
- fuser_config.candidate_maxlength = DEFAULT_CANDIDATE_MAX_LENGTH
- blender_config = llm_blender.BlenderConfig()
- blender_config.load_in_8bit = True
- blender_config.device = "cuda" # blender ranker and fuser device
- blender = llm_blender.Blender(blender_config, ranker_config, fuser_config)
+ MT_BENCH_HUMAN_JUDGE_EXAMPLES = []
+ dataset = load_dataset("lmsys/mt_bench_human_judgments")
+ for example in dataset['human']:
+     if example['turn'] != 1:
+         continue
+     MT_BENCH_HUMAN_JUDGE_EXAMPLES.append([
+         example['model_a'],
+         example['model_b'],
+         str(example['conversation_a']),
+         str(example['conversation_b']),
+         "Model A" if example['winner'] == 'model_a' else "Model B",
+     ])
+ def get_mt_bench_human_judge_examples(model_a, model_b, conversation_a, conversation_b, dummy_text):
+     chat_history_a = []
+     chat_history_b = []
+     conversation_a = eval(conversation_a)
+     conversation_b = eval(conversation_b)
+     for i in range(0, len(conversation_a), 2):
+         chat_history_a.append((conversation_a[i]['content'], conversation_a[i+1]['content']))
+         assert conversation_a[i]['role'] == 'user' and conversation_a[i+1]['role'] == 'assistant'
+     for i in range(0, len(conversation_b), 2):
+         chat_history_b.append((conversation_b[i]['content'], conversation_b[i+1]['content']))
+         assert conversation_b[i]['role'] == 'user' and conversation_b[i+1]['role'] == 'assistant'
+     return chat_history_a, chat_history_b
+
+
+ blender = llm_blender.Blender()
+ blender.loadranker("llm-blender/PairRM")
+ blender.loadfuser("llm-blender/gen_fuser_3b")

def update_base_llms_num(k, llm_outputs):
    k = int(k)
-     return [gr.Dropdown.update(choices=[f"LLM-{i+1}" for i in range(k)],
+     return [gr.Dropdown(choices=[f"LLM-{i+1}" for i in range(k)],
                value=f"LLM-1" if k >= 1 else "", visible=True),
            {f"LLM-{i+1}": llm_outputs.get(f"LLM-{i+1}", "") for i in range(k)}]


def display_llm_output(llm_outputs, selected_base_llm_name):
-     return gr.Textbox.update(value=llm_outputs.get(selected_base_llm_name, ""),
+     return gr.Textbox(value=llm_outputs.get(selected_base_llm_name, ""),
                label=selected_base_llm_name + " (Click Save to save current content)",
                placeholder=f"Enter {selected_base_llm_name} output here", show_label=True)

def save_llm_output(selected_base_llm_name, selected_base_llm_output, llm_outputs):
    llm_outputs.update({selected_base_llm_name: selected_base_llm_output})
    return llm_outputs

  def get_preprocess_examples(inst, input):
@@ -131,217 +159,366 @@ def display_fuser_output(fuser_output):


with gr.Blocks(theme='ParityError/Anime') as demo:
-     gr.Markdown(DESCRIPTIONS)
-     gr.Markdown("## Input and Base LLMs")
-     with gr.Row():
-         with gr.Column():
-             inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
-             input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
-         with gr.Column():
-             saved_llm_outputs = gr.State(value={})
-             with gr.Group():
-                 selected_base_llm_name_dropdown = gr.Dropdown(label="Base LLM",
-                     choices=[f"LLM-{i+1}" for i in range(MIN_BASE_LLM_NUM)], value="LLM-1", show_label=True)
-                 selected_base_llm_output = gr.Textbox(lines=4, label="LLM-1 (Click Save to save current content)",
-                     placeholder="Enter LLM-1 output here", show_label=True)
-                 with gr.Row():
-                     base_llm_outputs_save_button = gr.Button('Save', variant='primary')
-
-                     base_llm_outputs_clear_single_button = gr.Button('Clear Single', variant='primary')
-
-                     base_llm_outputs_clear_all_button = gr.Button('Clear All', variant='primary')
-             base_llms_num = gr.Slider(
-                 label='Number of base llms',
-                 minimum=MIN_BASE_LLM_NUM,
-                 maximum=MAX_BASE_LLM_NUM,
-                 step=1,
-                 value=MIN_BASE_LLM_NUM,
-             )

-     blender_state = gr.State(value={})
-     saved_rank_outputs = gr.State(value=[])
-     saved_fuse_outputs = gr.State(value=[])
-     gr.Markdown("## Blender Outputs")
-     with gr.Group():
-         rank_outputs = gr.Textbox(lines=1, label="Ranks of each LLM's output", placeholder="Ranking outputs", show_label=True)
-         fuser_outputs = gr.Textbox(lines=4, label="Fusing outputs", placeholder="Fusing outputs", show_label=True)
-         with gr.Row():
-             rank_button = gr.Button('Rank LLM Outputs', variant='primary')
-             fuse_button = gr.Button('Fuse Top-K ranked outputs', variant='primary')
-             clear_button = gr.Button('Clear Blender Outputs', variant='primary')
-     blender_config = gr.State(value={
-         "source_max_length": DEFAULT_SOURCE_MAX_LENGTH,
-         "candidate_max_length": DEFAULT_CANDIDATE_MAX_LENGTH,
-         "top_k_for_fuser": 3,
-         "max_new_tokens": DEFAULT_FUSER_MAX_NEW_TOKENS,
-         "temperature": 0.7,
-         "top_p": 1.0,
-     })

-     with gr.Accordion(label='Advanced options', open=False):
-         top_k_for_fuser = gr.Slider(
-             label='Top-k ranked candidates to fuse',
-             minimum=1,
-             maximum=3,
-             step=1,
-             value=3,
-         )
-         source_max_length = gr.Slider(
-             label='Max length of Instruction + Input',
-             minimum=1,
-             maximum=SOURCE_MAX_LENGTH,
-             step=1,
-             value=DEFAULT_SOURCE_MAX_LENGTH,
-         )
-         candidate_max_length = gr.Slider(
-             label='Max length of LLM-Output Candidate',
-             minimum=1,
-             maximum=CANDIDATE_MAX_LENGTH,
-             step=1,
-             value=DEFAULT_CANDIDATE_MAX_LENGTH,
-         )
-         max_new_tokens = gr.Slider(
-             label='Max new tokens fuser can generate',
-             minimum=1,
-             maximum=FUSER_MAX_NEW_TOKENS,
-             step=1,
-             value=DEFAULT_FUSER_MAX_NEW_TOKENS,
-         )
-         # temperature = gr.Slider(
-         #     label='Temperature of fuser generation',
-         #     minimum=0.1,
-         #     maximum=2.0,
-         #     step=0.1,
-         #     value=0.7,
-         # )
-         # top_p = gr.Slider(
-         #     label='Top-p of fuser generation',
-         #     minimum=0.05,
-         #     maximum=1.0,
-         #     step=0.05,
-         #     value=1.0,
-         # )
-         beam_size = gr.Slider(
-             label='Beam size of fuser generation',
-             minimum=1,
-             maximum=10,
-             step=1,
-             value=4,
-         )
-
-     examples_dummy_textbox = gr.Textbox(lines=1, label="", placeholder="", show_label=False, visible=False)
-     batch_examples = gr.Examples(
-         examples=EXAMPLES,
-         fn=get_preprocess_examples,
-         cache_examples=True,
-         examples_per_page=5,
-         inputs=[inst_textbox, input_textbox],
-         outputs=[inst_textbox, input_textbox, base_llms_num, examples_dummy_textbox],
-     )
-
-     base_llms_num.change(
-         fn=update_base_llms_num,
-         inputs=[base_llms_num, saved_llm_outputs],
-         outputs=[selected_base_llm_name_dropdown, saved_llm_outputs],
-     )
-
-     examples_dummy_textbox.change(
-         fn=update_base_llm_dropdown_along_examples,
-         inputs=[examples_dummy_textbox],
-         outputs=[saved_llm_outputs, rank_outputs, fuser_outputs],
-     ).then(
-         fn=display_llm_output,
-         inputs=[saved_llm_outputs, selected_base_llm_name_dropdown],
-         outputs=selected_base_llm_output,
-     )
-
-     selected_base_llm_name_dropdown.change(
-         fn=display_llm_output,
-         inputs=[saved_llm_outputs, selected_base_llm_name_dropdown],
-         outputs=selected_base_llm_output,
-     )
-
-     base_llm_outputs_save_button.click(
-         fn=save_llm_output,
-         inputs=[selected_base_llm_name_dropdown, selected_base_llm_output, saved_llm_outputs],
-         outputs=saved_llm_outputs,
-     )
-     base_llm_outputs_clear_all_button.click(
-         fn=lambda: [{}, ""],
-         inputs=[],
-         outputs=[saved_llm_outputs, selected_base_llm_output],
-     )
-     base_llm_outputs_clear_single_button.click(
-         fn=lambda: "",
-         inputs=[],
-         outputs=selected_base_llm_output,
-     )
-
-     rank_button.click(
-         fn=check_save_ranker_inputs,
-         inputs=[inst_textbox, input_textbox, saved_llm_outputs, blender_config],
-         outputs=blender_state,
-     ).success(
-         fn=llms_rank,
-         inputs=[inst_textbox, input_textbox, saved_llm_outputs, blender_config],
-         outputs=[saved_rank_outputs, rank_outputs],
-     )
-
-     fuse_button.click(
-         fn=check_fuser_inputs,
-         inputs=[blender_state, blender_config, saved_rank_outputs],
-         outputs=fuser_outputs,
-     ).success(
-         fn=llms_fuse,
-         inputs=[blender_state, blender_config, saved_rank_outputs],
-         outputs=[saved_fuse_outputs, fuser_outputs],
-     )
-
-     clear_button.click(
-         fn=lambda: ["", "", {}, []],
-         inputs=[],
-         outputs=[rank_outputs, fuser_outputs, blender_state, saved_rank_outputs],
-     )
-
-     # update blender config
-     source_max_length.change(
-         fn=lambda x, y: y.update({"source_max_length": x}) or y,
-         inputs=[source_max_length, blender_config],
-         outputs=blender_config,
-     )
-     candidate_max_length.change(
-         fn=lambda x, y: y.update({"candidate_max_length": x}) or y,
-         inputs=[candidate_max_length, blender_config],
-         outputs=blender_config,
-     )
-     top_k_for_fuser.change(
-         fn=lambda x, y: y.update({"top_k_for_fuser": x}) or y,
-         inputs=[top_k_for_fuser, blender_config],
-         outputs=blender_config,
-     )
-     max_new_tokens.change(
-         fn=lambda x, y: y.update({"max_new_tokens": x}) or y,
-         inputs=[max_new_tokens, blender_config],
-         outputs=blender_config,
-     )
-     # temperature.change(
-     #     fn=lambda x, y: y.update({"temperature": x}) or y,
-     #     inputs=[temperature, blender_config],
-     #     outputs=blender_config,
-     # )
-     # top_p.change(
-     #     fn=lambda x, y: y.update({"top_p": x}) or y,
-     #     inputs=[top_p, blender_config],
-     #     outputs=blender_config,
-     # )
-     beam_size.change(
-         fn=lambda x, y: y.update({"num_beams": x}) or y,
-         inputs=[beam_size, blender_config],
-         outputs=blender_config,
-     )
+
+
+     with gr.Tab("LLM-Blender"):
+         # llm-blender interface
+         with gr.Row():
+             gr.Markdown(descriptions.LLM_BLENDER_OVERALL_DESC)
+             gr.Image("https://github.com/yuchenlin/LLM-Blender/blob/main/docs/llm_blender.png?raw=true", height=300)
+         gr.Markdown("## Input and Base LLMs")
+         with gr.Row():
+             with gr.Column():
+                 inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
+                 input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
+             with gr.Column():
+                 saved_llm_outputs = gr.State(value={})
+                 with gr.Group():
+                     selected_base_llm_name_dropdown = gr.Dropdown(label="Base LLM",
+                         choices=[f"LLM-{i+1}" for i in range(MIN_BASE_LLM_NUM)], value="LLM-1", show_label=True)
+                     selected_base_llm_output = gr.Textbox(lines=4, label="LLM-1 (Click Save to save current content)",
+                         placeholder="Enter LLM-1 output here", show_label=True)
+                     with gr.Row():
+                         base_llm_outputs_save_button = gr.Button('Save', variant='primary')
+
+                         base_llm_outputs_clear_single_button = gr.Button('Clear Single', variant='primary')
+
+                         base_llm_outputs_clear_all_button = gr.Button('Clear All', variant='primary')
+                 base_llms_num = gr.Slider(
+                     label='Number of base llms',
+                     minimum=MIN_BASE_LLM_NUM,
+                     maximum=MAX_BASE_LLM_NUM,
+                     step=1,
+                     value=MIN_BASE_LLM_NUM,
+                 )
+
+         blender_state = gr.State(value={})
+         saved_rank_outputs = gr.State(value=[])
+         saved_fuse_outputs = gr.State(value=[])
+         gr.Markdown("## Blender Outputs")
+         with gr.Group():
+             rank_outputs = gr.Textbox(lines=1, label="Ranking outputs", placeholder="Ranking outputs", show_label=True)
+             fuser_outputs = gr.Textbox(lines=4, label="Fusing outputs", placeholder="Fusing outputs", show_label=True)
+         with gr.Row():
+             rank_button = gr.Button('Rank LLM Outputs', variant='primary')
+             fuse_button = gr.Button('Fuse Top-K ranked outputs', variant='primary')
+             clear_button = gr.Button('Clear Blender Outputs', variant='primary')
+         blender_config = gr.State(value={
+             "source_max_length": DEFAULT_SOURCE_MAX_LENGTH,
+             "candidate_max_length": DEFAULT_CANDIDATE_MAX_LENGTH,
+             "top_k_for_fuser": 3,
+             "max_new_tokens": DEFAULT_FUSER_MAX_NEW_TOKENS,
+             "temperature": 0.7,
+             "top_p": 1.0,
+         })
+
+         with gr.Accordion(label='Advanced options', open=False):
+             source_max_length = gr.Slider(
+                 label='Max length of Instruction + Input',
+                 minimum=1,
+                 maximum=SOURCE_MAX_LENGTH,
+                 step=1,
+                 value=DEFAULT_SOURCE_MAX_LENGTH,
+             )
+             candidate_max_length = gr.Slider(
+                 label='Max length of LLM-Output Candidate',
+                 minimum=1,
+                 maximum=CANDIDATE_MAX_LENGTH,
+                 step=1,
+                 value=DEFAULT_CANDIDATE_MAX_LENGTH,
+             )
+             top_k_for_fuser = gr.Slider(
+                 label='Top-k ranked candidates to fuse',
+                 minimum=1,
+                 maximum=3,
+                 step=1,
+                 value=3,
+             )
+             max_new_tokens = gr.Slider(
+                 label='Max new tokens fuser can generate',
+                 minimum=1,
+                 maximum=FUSER_MAX_NEW_TOKENS,
+                 step=1,
+                 value=DEFAULT_FUSER_MAX_NEW_TOKENS,
+             )
+             temperature = gr.Slider(
+                 label='Temperature of fuser generation',
+                 minimum=0.1,
+                 maximum=2.0,
+                 step=0.1,
+                 value=0.7,
+             )
+             top_p = gr.Slider(
+                 label='Top-p of fuser generation',
+                 minimum=0.05,
+                 maximum=1.0,
+                 step=0.05,
+                 value=1.0,
+             )
+
+         examples_dummy_textbox = gr.Textbox(lines=1, label="", placeholder="", show_label=False, visible=False)
+         batch_examples = gr.Examples(
+             examples=EXAMPLES,
+             fn=get_preprocess_examples,
+             cache_examples=True,
+             examples_per_page=5,
+             inputs=[inst_textbox, input_textbox],
+             outputs=[inst_textbox, input_textbox, base_llms_num, examples_dummy_textbox],
+         )
+
+         base_llms_num.change(
+             fn=update_base_llms_num,
+             inputs=[base_llms_num, saved_llm_outputs],
+             outputs=[selected_base_llm_name_dropdown, saved_llm_outputs],
+         )
+
+         examples_dummy_textbox.change(
+             fn=update_base_llm_dropdown_along_examples,
+             inputs=[examples_dummy_textbox],
+             outputs=[saved_llm_outputs, rank_outputs, fuser_outputs],
+         ).then(
+             fn=display_llm_output,
+             inputs=[saved_llm_outputs, selected_base_llm_name_dropdown],
+             outputs=selected_base_llm_output,
+         )
+
+         selected_base_llm_name_dropdown.change(
+             fn=display_llm_output,
+             inputs=[saved_llm_outputs, selected_base_llm_name_dropdown],
+             outputs=selected_base_llm_output,
+         )
+
+         base_llm_outputs_save_button.click(
+             fn=save_llm_output,
+             inputs=[selected_base_llm_name_dropdown, selected_base_llm_output, saved_llm_outputs],
+             outputs=saved_llm_outputs,
+         )
+         base_llm_outputs_clear_all_button.click(
+             fn=lambda: [{}, ""],
+             inputs=[],
+             outputs=[saved_llm_outputs, selected_base_llm_output],
+         )
+         base_llm_outputs_clear_single_button.click(
+             fn=lambda: "",
+             inputs=[],
+             outputs=selected_base_llm_output,
+         )
+
+
+         rank_button.click(
+             fn=check_save_ranker_inputs,
+             inputs=[inst_textbox, input_textbox, saved_llm_outputs, blender_config],
+             outputs=blender_state,
+         ).success(
+             fn=llms_rank,
+             inputs=[inst_textbox, input_textbox, saved_llm_outputs, blender_config],
+             outputs=[saved_rank_outputs, rank_outputs],
+         )
+
+         fuse_button.click(
+             fn=check_fuser_inputs,
+             inputs=[blender_state, blender_config, saved_rank_outputs],
+             outputs=[],
+         ).success(
+             fn=llms_fuse,
+             inputs=[blender_state, blender_config, saved_rank_outputs],
+             outputs=[saved_fuse_outputs, fuser_outputs],
+         )
+
+         clear_button.click(
+             fn=lambda: ["", "", {}, []],
+             inputs=[],
+             outputs=[rank_outputs, fuser_outputs, blender_state, saved_rank_outputs],
+         )
+
+         # update blender config
+         source_max_length.change(
+             fn=lambda x, y: y.update({"source_max_length": x}) or y,
+             inputs=[source_max_length, blender_config],
+             outputs=blender_config,
+         )
+         candidate_max_length.change(
+             fn=lambda x, y: y.update({"candidate_max_length": x}) or y,
+             inputs=[candidate_max_length, blender_config],
+             outputs=blender_config,
+         )
+         top_k_for_fuser.change(
+             fn=lambda x, y: y.update({"top_k_for_fuser": x}) or y,
+             inputs=[top_k_for_fuser, blender_config],
+             outputs=blender_config,
+         )
+         max_new_tokens.change(
+             fn=lambda x, y: y.update({"max_new_tokens": x}) or y,
+             inputs=[max_new_tokens, blender_config],
+             outputs=blender_config,
+         )
+         temperature.change(
+             fn=lambda x, y: y.update({"temperature": x}) or y,
+             inputs=[temperature, blender_config],
+             outputs=blender_config,
+         )
+         top_p.change(
+             fn=lambda x, y: y.update({"top_p": x}) or y,
+             inputs=[top_p, blender_config],
+             outputs=blender_config,
+         )
+
+
+     with gr.Tab("PairRM"):
+         # PairRM interface
+         with gr.Row():
+             gr.Markdown(descriptions.PairRM_OVERALL_DESC)
+             gr.Image("https://yuchenlin.xyz/LLM-Blender/pairranker.png")
+
+         with gr.Tab("Compare two responses"):
+             instruction = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
+             with gr.Row():
+                 response1 = gr.Textbox(lines=4, label="Response 1", placeholder="Enter response 1 here", show_label=True)
+                 response2 = gr.Textbox(lines=4, label="Response 2", placeholder="Enter response 2 here", show_label=True)
+             with gr.Row():
+                 compare_button = gr.Button('Compare', variant='primary')
+                 clear_button = gr.Button('Clear', variant='primary')
+             with gr.Row():
+                 compare_result = gr.Textbox(lines=1, label="Compare Result", placeholder="", show_label=True)
+                 compare_result_prob = gr.Textbox(lines=1, label="PairRM Confidence", placeholder="", show_label=True)
+
+             def compare_fn(inst, response1, response2):
+                 if not inst:
+                     raise gr.Error("Please enter instruction")
+                 if not response1 or not response2:
+                     raise gr.Error("Please enter response 1 and response 2")
+                 comparison_results = blender.compare([inst], [response1], [response2], return_logits=True)
+                 logit = comparison_results[0]
+                 if logit > 0:
+                     result = "Response 1 is better than Response 2"
+                     prob = f"Confidence: {round(logit, 2)}"
+                 elif logit < 0:
+                     result = "Response 2 is better than Response 1"
+                     prob = f"Confidence: {round(abs(logit), 2)}"
+                 else:
+                     result = "Response 1 and Response 2 are equally good"
+                     prob = "No confidence for tie"
+
+                 return [result, prob]
+             compare_button.click(
+                 fn=compare_fn,
+                 inputs=[instruction, response1, response2],
+                 outputs=[compare_result, compare_result_prob],
+             )
+             clear_button.click(
+                 fn=lambda: ["", ""],
+                 inputs=[],
+                 outputs=[compare_result, compare_result_prob],
+             )
+
+             hhh_dummy_textbox1 = gr.Textbox(lines=1, label="subset", placeholder="", show_label=False, visible=False)
+             hhh_dummy_textbox2 = gr.Textbox(lines=1, label="Better Response", placeholder="", show_label=False, visible=False)
+             gr.Markdown("## Examples from [HuggingFaceH4/hhh_alignment](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment)")
+             gr.Examples(
+                 HHH_EXAMPLES,
+                 fn=get_hhh_examples,
+                 cache_examples=True,
+                 examples_per_page=5,
+                 inputs=[hhh_dummy_textbox1, instruction, response1, response2, hhh_dummy_textbox2],
+                 outputs=[instruction, response1, response2],
+             )
+
+
+         with gr.Tab("Compare assistant's response in two multi-turn conversations"):
+
+             gr.Markdown("NOTE: Comparing two conversations assumes that the user query in each turn is the same in both conversations.")
+             def append_message(message, chat_history):
+                 if not message:
+                     return "", chat_history
+                 if len(chat_history) == 0:
+                     chat_history.append((message, "(Please enter your bot response)"))
+                 else:
+                     if chat_history[-1][1] == "(Please enter your bot response)":
+                         chat_history[-1] = (chat_history[-1][0], message)
+                     else:
+                         chat_history.append((message, "(Please enter your bot response)"))
+                 return "", chat_history
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("### Conversation A")
+                     chatbot1 = gr.Chatbot()
+                     msg1 = gr.Textbox(lines=1, label="Enter Chat history for Conversation A", placeholder="Enter your message here", show_label=True)
+                     clear1 = gr.ClearButton([msg1, chatbot1])
+                     msg1.submit(append_message, [msg1, chatbot1], [msg1, chatbot1])
+                 with gr.Column():
+                     gr.Markdown("### Conversation B")
+                     chatbot2 = gr.Chatbot()
+                     msg2 = gr.Textbox(lines=1, label="Enter Chat history for Conversation B", placeholder="Enter your message here", show_label=True)
+                     clear2 = gr.ClearButton([msg2, chatbot2])
+                     msg2.submit(append_message, [msg2, chatbot2], [msg2, chatbot2])
+             with gr.Row():
+                 compare_button = gr.Button('Compare', variant='primary')
+             with gr.Row():
+                 compare_result = gr.Textbox(lines=1, label="Compare Result", placeholder="", show_label=True)
+                 compare_result_prob = gr.Textbox(lines=1, label="PairRM Confidence", placeholder="", show_label=True)
+
+             def compare_conv_fn(chat_history1, chat_history2):
+                 if len(chat_history1) == 0 or len(chat_history2) == 0:
+                     raise gr.Error("Please enter chat history for both conversations")
+                 assert chat_history1[-1][1] != "(Please enter your bot response)" \
+                     and chat_history2[-1][1] != "(Please enter your bot response)", \
+                     "Please complete chat history for both conversations"
+                 chat1_messages = []
+                 for item in chat_history1:
+                     chat1_messages.append({
+                         "role": "USER",
+                         "content": item[0],
+                     })
+                     chat1_messages.append({
+                         "role": "ASSISTANT",
+                         "content": item[1],
+                     })
+                 chat2_messages = []
+                 for item in chat_history2:
+                     chat2_messages.append({
+                         "role": "USER",
+                         "content": item[0],
+                     })
+                     chat2_messages.append({
+                         "role": "ASSISTANT",
+                         "content": item[1],
+                     })
+
+                 comparison_results = blender.compare_conversations([chat1_messages], [chat2_messages], return_logits=True)
+                 logit = comparison_results[0]
+                 if logit > 0:
+                     result = "Assistant's response in Conversation A is better than in Conversation B"
+                     prob = f"Confidence: {round(logit, 2)}"
+                 elif logit < 0:
+                     result = "Assistant's response in Conversation B is better than in Conversation A"
+                     prob = f"Confidence: {round(abs(logit), 2)}"
+                 else:
+                     result = "Assistant's responses in Conversation A and Conversation B are equally good"
+                     prob = "No confidence for tie"
+
+                 return [result, prob]
+
+             compare_button.click(
+                 fn=compare_conv_fn,
+                 inputs=[chatbot1, chatbot2],
+                 outputs=[compare_result, compare_result_prob],
+             )
+
+             model_a_dummy_textbox = gr.Textbox(lines=1, label="Model A", placeholder="", show_label=False, visible=False)
+             model_b_dummy_textbox = gr.Textbox(lines=1, label="Model B", placeholder="", show_label=False, visible=False)
+             winner_dummy_textbox = gr.Textbox(lines=1, label="Better Model in conversation", placeholder="", show_label=False, visible=False)
+             chatbot1_dummy_textbox = gr.Textbox(lines=1, label="Conversation A", placeholder="", show_label=False, visible=False)
+             chatbot2_dummy_textbox = gr.Textbox(lines=1, label="Conversation B", placeholder="", show_label=False, visible=False)
+             gr.Markdown("## Examples from [lmsys/mt_bench_human_judgments](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments)")
+             gr.Examples(
+                 MT_BENCH_HUMAN_JUDGE_EXAMPLES,
+                 fn=get_mt_bench_human_judge_examples,
+                 cache_examples=True,
+                 examples_per_page=5,
+                 inputs=[model_a_dummy_textbox, model_b_dummy_textbox, chatbot1_dummy_textbox, chatbot2_dummy_textbox, winner_dummy_textbox],
+                 outputs=[chatbot1, chatbot2],
+             )
+
+     gr.Markdown(descriptions.CITATION)

  demo.queue(max_size=20).launch()
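This commit replaces the hand-built `RankerConfig`/`GenFuserConfig` setup (and the DeBERTa checkpoint fetched via gdown) with two hub-hosted loading calls. A minimal sketch of driving the refactored API end to end: `loadranker`/`loadfuser` appear in this diff, while the `rank`/`fuse` calls and the rank-1-is-best convention follow the LLM-Blender README and should be treated as assumptions.

```python
import llm_blender

blender = llm_blender.Blender()
blender.loadranker("llm-blender/PairRM")       # PairRM ranker from the HF Hub
blender.loadfuser("llm-blender/gen_fuser_3b")  # GenFuser checkpoint

inputs = ["What is the capital of France?"]
candidates = [[
    "Paris is the capital of France.",
    "It's Paris.",
    "The capital of France is Lyon.",
]]

# Rank each input's candidates (rank 1 = best), then fuse the top ones.
ranks = blender.rank(inputs, candidates)
topk = [[cand for _, cand in sorted(zip(rs, cands))][:3]
        for rs, cands in zip(ranks, candidates)]
fused = blender.fuse(inputs, topk)
print(ranks[0], fused[0])
```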
 
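`get_mt_bench_human_judge_examples` round-trips the example conversations through `str(...)` and parses them back with `eval`. Since the serialized form is a plain Python literal, the standard library's `ast.literal_eval` would do the same job without executing arbitrary code; a possible hardening, not something this commit does:

```python
import ast

serialized = str([{"role": "user", "content": "hi"},
                  {"role": "assistant", "content": "hello"}])
conversation = ast.literal_eval(serialized)  # parses literals only; no code execution
assert conversation[1]["role"] == "assistant"
```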
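The `.change` handlers keep `blender_config` in sync with the one-liner `lambda x, y: y.update({...}) or y`. This works because `dict.update` returns `None` (falsy), so the `or y` hands the mutated state dict back to Gradio:

```python
cfg = {"top_p": 1.0}
set_top_p = lambda x, y: y.update({"top_p": x}) or y  # dict.update() -> None, so `or y` yields the dict
assert set_top_p(0.9, cfg) is cfg
assert cfg["top_p"] == 0.9
```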
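`blender.compare(..., return_logits=True)` yields one logit per pair; `compare_fn` uses the sign to pick a winner and reports the magnitude as confidence. If a probability-style confidence is preferred, one assumed reading (not what the demo does) is the sigmoid of the logit:

```python
import math

logit = 1.3  # e.g. comparison_results[0]
winner = "Response 1" if logit > 0 else "Response 2"
p_first_better = 1.0 / (1.0 + math.exp(-logit))  # ~0.79 for logit = 1.3
```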
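`compare_conversations` takes two batches of whole conversations; `compare_conv_fn` builds them as alternating `USER`/`ASSISTANT` messages. A minimal call with the same shape, assuming a `blender` loaded as at the top of app.py:

```python
# Assumes: blender = llm_blender.Blender(); blender.loadranker("llm-blender/PairRM")
conv_a = [{"role": "USER", "content": "Plan one day in Rome."},
          {"role": "ASSISTANT", "content": "Morning: Colosseum; afternoon: Forum; evening: Trastevere."}]
conv_b = [{"role": "USER", "content": "Plan one day in Rome."},
          {"role": "ASSISTANT", "content": "Rome has many famous sights."}]

logits = blender.compare_conversations([conv_a], [conv_b], return_logits=True)
print("A preferred" if logits[0] > 0 else "B preferred")
```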
descriptions.py ADDED
@@ -0,0 +1,49 @@
+ LLM_BLENDER_OVERALL_DESC = """
+ LLM-Blender is an innovative ensembling framework that attains consistently superior performance by leveraging the diverse strengths of multiple open-source large language models (LLMs).
+ LLM-Blender cuts away weaknesses through ranking and integrates strengths through fusing generation to enhance the capability of LLMs.
+
+ Our framework consists of two complementary modules, **PairRanker** and **GenFuser**,
+ addressing the observation that the optimal LLM can vary significantly from example to example.
+ **PairRanker** employs a specialized pairwise comparison method to distinguish subtle differences between candidate outputs.
+ **GenFuser** aims to merge the top-ranked candidates from the aggregation of PairRanker's pairwise
+ comparisons into an improved output by capitalizing on their strengths and mitigating their weaknesses.
+
+ | [Paper](https://arxiv.org/abs/2306.02561)
+ | [Code](https://github.com/yuchenlin/LLM-Blender)
+ | [Dataset](https://huggingface.co/datasets/llm-blender/mix-instruct)
+ | [Models](https://huggingface.co/llm-blender)
+ | [Tweet](https://twitter.com/billyuchenlin/status/1668666357058277377)
+
+ Try LLM-Blender now! 👇
+ """
+
+ PairRM_OVERALL_DESC = """## 🤗 [PairRM](https://huggingface.co/llm-blender/PairRM)
+
+ PairRM is a reward model based on the PairRanker architecture. It has been trained on various high-quality,
+ large-scale datasets with human preference annotations and exhibits a strong correlation with human preferences.
+
+ **While PairRM is an extremely small model (0.4B), our tests on various human-alignment benchmarks show performance approaching that of GPT-4.** (See [PairRM](https://huggingface.co/llm-blender/PairRM) for more detailed results.)
+
+ PairRM can easily be applied in three scenarios:
+ 1. [Directly compare two responses or two conversations](https://huggingface.co/llm-blender/PairRM#use-case-1-compare-responses-quality-evaluator)
+ 2. [Do best-of-n sampling to enhance the ability of an LLM](https://huggingface.co/llm-blender/PairRM#use-case-2-best-of-n-sampling-decoding-enhancing)
+ 3. [RLHF alignment](https://huggingface.co/llm-blender/PairRM#use-case-3-rlhf)
+
+ This demo lets users interact with PairRM in the first scenario. Feel free to compare the quality of two responses or two conversations by entering them in the text boxes below.
+
+ Try PairRM now! 👇
+ """
+
+ CITATION = """
+
+ ## Citation
+ ```bibtex
+ @inproceedings{llm-blender-2023,
+     title = "LLM-Blender: Ensembling Large Language Models with Pairwise Comparison and Generative Fusion",
+     author = "Jiang, Dongfu and Ren, Xiang and Lin, Bill Yuchen",
+     booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL 2023)",
+     year = "2023"
+ }
+
+ ```
+ """