Create app.py
app.py
ADDED
@@ -0,0 +1,142 @@
import gradio as gr
import pandas as pd
import random
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, login
import os
from datetime import datetime

# Authenticate with the Hub so the collected votes can be pushed back to a dataset.
hf_api = HfApi()
HF_TOKEN = os.getenv('HF_TOKEN')
login(token=HF_TOKEN)


# Instruction/response pairs for the three model families under comparison.
dataset_1 = load_dataset("Q-bert/LlamaPair")["train"]
dataset_2 = load_dataset("Q-bert/QwenPair")["train"]
dataset_3 = load_dataset("Q-bert/MistralPair")["train"]

# Schema of the vote log.
df_log = pd.DataFrame(columns=["instruction", "selected_model", "pair", "submission_time"])


def get_random_row():
    # Pick a random pair dataset and row, then shuffle the two answers so their
    # on-screen order gives no hint about which model produced them.
    selected_dataset = random.choice([dataset_1, dataset_2, dataset_3])
    pair_name = ("LlamaPair" if selected_dataset is dataset_1
                 else "QwenPair" if selected_dataset is dataset_2
                 else "MistralPair")

    row = selected_dataset[random.randint(0, len(selected_dataset) - 1)]
    instruction = row["instruction"]
    response_human = row["response_human_like_model"]
    # Column name spelling ("offical") follows the source datasets.
    response_official = row["response_offical_instruct_model"]

    responses = [("Human-like Model", response_human),
                 ("Official Model", response_official)]
    random.shuffle(responses)

    return (instruction, responses[0][1], responses[1][1],
            responses[0][0], responses[1][0], pair_name)


def format_response_html(response):
    # Wrap a model response in a simple styled card.
    return f'''
    <div style="border: 1px solid white; background-color: black;
                padding: 10px; margin: 5px;">
        <strong style="color: white;">Answer:</strong>
        <div style="color: white;">{response}</div>
    </div>
    '''


def submit_choice(selected_response, instruction, label_1, label_2, pair_name):
    # Append the vote to the shared log dataset on the Hub, then load a fresh question.
    try:
        df_log = pd.DataFrame(load_dataset("Q-bert/log")["train"])
    except Exception:
        # First run: the log dataset may not exist yet.
        df_log = pd.DataFrame(columns=["instruction", "selected_model",
                                       "pair", "submission_time"])

    selected_model = label_1 if selected_response == "Answer 1" else label_2
    submission_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    new_entry = pd.DataFrame({
        "instruction": [instruction],
        "selected_model": [selected_model],
        "pair": [pair_name],
        "submission_time": [submission_time]
    })

    df_log = pd.concat([df_log, new_entry], ignore_index=True)
    df_log.to_csv("annotations_log.csv", index=False)
    log = Dataset.from_pandas(df_log)
    log.push_to_hub("Q-bert/log")

    new_instruction, new_response_1, new_response_2, new_label_1, new_label_2, new_pair_name = get_random_row()

    # The new instruction is returned as well, so the stored state always matches
    # the question currently on screen when the next vote is logged.
    return (
        f"### Question:\n{new_instruction}",
        format_response_html(new_response_1),
        format_response_html(new_response_2),
        new_instruction,
        new_label_1,
        new_label_2,
        new_pair_name,
        "Your choice has been recorded. A new question is loaded!"
    )


def create_interface():
    # Build the Gradio Blocks UI with an initial random question.
    instruction, response_1, response_2, label_1, label_2, pair_name = get_random_row()

    with gr.Blocks(theme=gr.themes.Default()) as demo:
        gr.Markdown("# Human-Likeness Voting System")
        gr.Markdown("![image/png](https://cdn-uploads.huggingface.co/production/uploads/63da3d7ae697e5898cb86854/6vL52mOW6IqZu8DFlAZ4C.png)")
        gr.Markdown("This interface was created to compare the human-like LLMs developed by our team with the models on which they were trained. The results of this study will be presented in a paper. Please be fair and careful when casting your vote and selecting an answer. Thank you for your contribution on behalf of the research team.")
        gr.Markdown("## Instructions")
        gr.Markdown(
            """
            1. Read the provided question carefully.
            2. Read both responses carefully.
            3. Select the response that most resembles a human-written answer."""
        )
        # Hidden state tracking the current question and which model produced which answer.
        current_instruction = gr.State(instruction)
        label_1_state = gr.State(label_1)
        label_2_state = gr.State(label_2)
        pair_name_state = gr.State(pair_name)
        question_display = gr.Markdown(value=f"### Question:\n{instruction}")
        with gr.Row():
            with gr.Column():
                response_1_display = gr.HTML(format_response_html(response_1))
            with gr.Column():
                response_2_display = gr.HTML(format_response_html(response_2))
        with gr.Row():
            selected_response = gr.Radio(
                ["Answer 1", "Answer 2"],
                label="Which answer is more human-like?",
                interactive=True
            )
        submit_btn = gr.Button("Submit Choice")

        status_output = gr.Textbox(
            interactive=False,
            label="Status",
            value="Select an answer and click Submit"
        )
        submit_btn.click(
            fn=submit_choice,
            inputs=[
                selected_response,
                current_instruction,
                label_1_state,
                label_2_state,
                pair_name_state
            ],
            outputs=[
                question_display,
                response_1_display,
                response_2_display,
                current_instruction,
                label_1_state,
                label_2_state,
                pair_name_state,
                status_output
            ]
        )

    return demo


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(share=True)
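The script assumes four third-party packages (gradio, pandas, datasets, huggingface_hub) and a write-capable HF_TOKEN in the environment, since votes are pushed back to the Hub with push_to_hub. A minimal sketch of an accompanying requirements.txt, assuming unpinned versions (this file is not part of the commit):

gradio
pandas
datasets
huggingface_hub

Under the same assumptions, running the app outside of Spaces would look something like HF_TOKEN=<your-write-token> python app.py; share=True in launch() additionally exposes a temporary public Gradio link.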