Create app.py
app.py
ADDED
@@ -0,0 +1,142 @@
import gradio as gr
import pandas as pd
import random
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, login
import os
from datetime import datetime

# Authenticate with the Hub so the collected votes can be pushed back to a dataset.
hf_api = HfApi()
HF_TOKEN = os.getenv('HF_TOKEN')
login(token=HF_TOKEN)


# Instruction/response pairs for the three model families under comparison.
dataset_1 = load_dataset("Q-bert/LlamaPair")["train"]
dataset_2 = load_dataset("Q-bert/QwenPair")["train"]
dataset_3 = load_dataset("Q-bert/MistralPair")["train"]

# Schema of the vote log.
df_log = pd.DataFrame(columns=["instruction", "selected_model", "pair", "submission_time"])


def get_random_row():
    # Pick a random pair dataset and row, then shuffle the two answers so their
    # on-screen order gives no hint about which model produced them.
    selected_dataset = random.choice([dataset_1, dataset_2, dataset_3])
    pair_name = ("LlamaPair" if selected_dataset is dataset_1
                 else "QwenPair" if selected_dataset is dataset_2
                 else "MistralPair")

    row = selected_dataset[random.randint(0, len(selected_dataset) - 1)]
    instruction = row["instruction"]
    response_human = row["response_human_like_model"]
    # Column name spelling ("offical") follows the source datasets.
    response_official = row["response_offical_instruct_model"]

    responses = [("Human-like Model", response_human),
                 ("Official Model", response_official)]
    random.shuffle(responses)

    return (instruction, responses[0][1], responses[1][1],
            responses[0][0], responses[1][0], pair_name)


def format_response_html(response):
    # Wrap a model response in a simple styled card.
    return f'''
    <div style="border: 1px solid white; background-color: black;
                padding: 10px; margin: 5px;">
        <strong style="color: white;">Answer:</strong>
        <div style="color: white;">{response}</div>
    </div>
    '''


def submit_choice(selected_response, instruction, label_1, label_2, pair_name):
    # Append the vote to the shared log dataset on the Hub, then load a fresh question.
    try:
        df_log = pd.DataFrame(load_dataset("Q-bert/log")["train"])
    except Exception:
        # First run: the log dataset may not exist yet.
        df_log = pd.DataFrame(columns=["instruction", "selected_model",
                                       "pair", "submission_time"])

    selected_model = label_1 if selected_response == "Answer 1" else label_2
    submission_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    new_entry = pd.DataFrame({
        "instruction": [instruction],
        "selected_model": [selected_model],
        "pair": [pair_name],
        "submission_time": [submission_time]
    })

    df_log = pd.concat([df_log, new_entry], ignore_index=True)
    df_log.to_csv("annotations_log.csv", index=False)
    log = Dataset.from_pandas(df_log)
    log.push_to_hub("Q-bert/log")

    new_instruction, new_response_1, new_response_2, new_label_1, new_label_2, new_pair_name = get_random_row()

    # The new instruction is returned as well, so the stored state always matches
    # the question currently on screen when the next vote is logged.
    return (
        f"### Question:\n{new_instruction}",
        format_response_html(new_response_1),
        format_response_html(new_response_2),
        new_instruction,
        new_label_1,
        new_label_2,
        new_pair_name,
        "Your choice has been recorded. A new question is loaded!"
    )


def create_interface():
    # Build the Gradio Blocks UI with an initial random question.
    instruction, response_1, response_2, label_1, label_2, pair_name = get_random_row()

    with gr.Blocks(theme=gr.themes.Default()) as demo:
        gr.Markdown("# Human-Likeness Voting System")
        gr.Markdown("![image/png](https://cdn-uploads.huggingface.co/production/uploads/63da3d7ae697e5898cb86854/6vL52mOW6IqZu8DFlAZ4C.png)")
        gr.Markdown("This interface was created to compare the human-like LLMs developed by our team with the models on which they were trained. The results of this study will be presented in a paper. Please be fair and careful when casting your vote and selecting an answer. Thank you for your contribution on behalf of the research team.")
        gr.Markdown("## Instructions")
        gr.Markdown(
            """
            1. Read the provided question carefully.
            2. Read both responses carefully.
            3. Select the response that most resembles a human-written answer."""
        )
        # Hidden state tracking the current question and which model produced which answer.
        current_instruction = gr.State(instruction)
        label_1_state = gr.State(label_1)
        label_2_state = gr.State(label_2)
        pair_name_state = gr.State(pair_name)
        question_display = gr.Markdown(value=f"### Question:\n{instruction}")
        with gr.Row():
            with gr.Column():
                response_1_display = gr.HTML(format_response_html(response_1))
            with gr.Column():
                response_2_display = gr.HTML(format_response_html(response_2))
        with gr.Row():
            selected_response = gr.Radio(
                ["Answer 1", "Answer 2"],
                label="Which answer is more human-like?",
                interactive=True
            )
        submit_btn = gr.Button("Submit Choice")

        status_output = gr.Textbox(
            interactive=False,
            label="Status",
            value="Select an answer and click Submit"
        )
        submit_btn.click(
            fn=submit_choice,
            inputs=[
                selected_response,
                current_instruction,
                label_1_state,
                label_2_state,
                pair_name_state
            ],
            outputs=[
                question_display,
                response_1_display,
                response_2_display,
                current_instruction,
                label_1_state,
                label_2_state,
                pair_name_state,
                status_output
            ]
        )

    return demo


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(share=True)
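The script assumes four third-party packages (gradio, pandas, datasets, huggingface_hub) and a write-capable HF_TOKEN in the environment, since votes are pushed back to the Hub with push_to_hub. A minimal sketch of an accompanying requirements.txt, assuming unpinned versions (this file is not part of the commit):

gradio
pandas
datasets
huggingface_hub

Under the same assumptions, running the app outside of Spaces would look something like HF_TOKEN=<your-write-token> python app.py; share=True in launch() additionally exposes a temporary public Gradio link.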