File size: 7,274 Bytes
61261dd 6eb8685 61261dd eaac9a1 c5a32fe 8b5f978 c5a32fe d12c61c 6e511fa 61261dd 81ced9c 61261dd 8b5f978 6e511fa 8b5f978 6e511fa 61261dd 6e511fa c5a32fe 61261dd 81ced9c 61261dd d08d32b 61261dd d08d32b 6eb8685 d08d32b c5a32fe d08d32b 6eb8685 61261dd 805de22 61261dd 805de22 c5a32fe 6e511fa c5a32fe 61261dd 805de22 61261dd c5a32fe bfb0da9 c5a32fe 61261dd bfb0da9 61261dd 805de22 c5a32fe 805de22 c5a32fe 6e511fa 805de22 c5a32fe c953b7b 61261dd c5a32fe d08d32b 61261dd 003e27c c5b48e4 003e27c 61261dd c5a32fe 61261dd 003e27c 61261dd d08d32b 61261dd d08d32b 61261dd e421714 61261dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import os
import random
import re
from datetime import datetime

import gradio as gr
import markdown
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, login
# Hub API client; not referenced again in the visible part of this file.
hf_api = HfApi()
# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv('HF_TOKEN')
# Authenticate with the Hugging Face Hub as soon as the module is imported.
login(token=HF_TOKEN)
# Hub dataset repository that collected votes are pushed to.
log_dataset = "HumanLLMs/log"
# (pair_name, row_index) tuples that get_random_row has already handed out.
# Module-level, so it is shared by every call made in this process.
selected_indices = set()
# "train" splits of the three response-pair datasets questions are drawn from.
dataset_1 = load_dataset("HumanLLMs/LlamaPair")["train"]
dataset_2 = load_dataset("HumanLLMs/QwenPair")["train"]
dataset_3 = load_dataset("HumanLLMs/MistralPair")["train"]
# Empty frame with the log schema. submit_choice assigns its own local df_log,
# so this module-level frame is not read by it.
df_log = pd.DataFrame(columns=["instruction", "selected_model", "pair", "submission_time"])
# Characters treated as emoji: the pictographic blocks plus the selector and
# keycap combiner that only exist to build emoji sequences. A zero-width
# joiner is consumed only when it directly follows one of those characters, so
# joiners used by ordinary scripts are left alone.
_EMOJI_PATTERN = re.compile(
    "(?:["
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag pairs)
    "\U0001F300-\U0001FAFF"  # pictographs, emoticons, transport, skin tones
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\u2B50\u2B55"           # star and heavy circle
    "\uFE0F\u20E3"           # emoji variation selector, keycap combiner
    "]\u200D?)+"             # optional joiner gluing a multi-part emoji
)


def remove_emojis(text):
    """Return *text* with emoji characters removed.

    Only emoji (and the invisible code points that glue emoji sequences
    together) are stripped. Other non-ASCII text such as accented letters,
    typographic quotes and dashes, or non-Latin scripts is preserved; the
    previous ASCII round-trip deleted all of it.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string stays empty.
    """
    return _EMOJI_PATTERN.sub("", text)
def get_random_row():
    """Draw a not-yet-served row from one of the three pair datasets.

    A dataset is picked at random among those that still have unserved rows,
    then a random unserved row index is drawn from it and recorded in the
    module-level ``selected_indices`` set. When every row of every dataset
    has been served, the set is cleared and a new cycle starts (the previous
    implementation looped forever in that situation).

    Returns:
        tuple: ``(instruction, response_a, response_b, label_a, label_b,
        pair_name)``. The two responses are emoji-stripped and in random
        order; each label ("Human-like Model" or "Official Model") names the
        model behind the response in the same position.

    Raises:
        ValueError: If all three datasets are empty.
    """
    # Keep each dataset next to its name instead of recovering the name by
    # comparing dataset objects afterwards.
    named_datasets = [
        ("LlamaPair", dataset_1),
        ("QwenPair", dataset_2),
        ("MistralPair", dataset_3),
    ]
    non_empty = [(name, ds) for name, ds in named_datasets if len(ds) > 0]
    if not non_empty:
        raise ValueError("all pair datasets are empty")

    # Count how many rows of each dataset have been served so far.
    served = {}
    for name, _ in selected_indices:
        served[name] = served.get(name, 0) + 1

    available = [(name, ds) for name, ds in non_empty
                 if served.get(name, 0) < len(ds)]
    if not available:
        # Everything has been shown once; allow repeats from here on.
        selected_indices.clear()
        available = non_empty

    pair_name, selected_dataset = random.choice(available)

    # Rejection sampling terminates because this dataset has a free row.
    idx = random.randrange(len(selected_dataset))
    while (pair_name, idx) in selected_indices:
        idx = random.randrange(len(selected_dataset))
    selected_indices.add((pair_name, idx))

    row = selected_dataset[idx]
    instruction = row["instruction"]
    responses = [
        ("Human-like Model", row["response_human_like_model"]),
        # The column name is spelled "offical" in the dataset itself.
        ("Official Model", row["response_offical_instruct_model"]),
    ]
    # Randomize left/right placement so position does not reveal the model.
    random.shuffle(responses)

    (label_a, response_a), (label_b, response_b) = responses
    return (instruction, remove_emojis(response_a), remove_emojis(response_b),
            label_a, label_b, pair_name)
def _format_response_html(response, answer_number):
    """Render *response* (Markdown) as a white-on-black "Answer N" HTML card."""
    return f'''
<div style="border: 1px solid white; background-color: black;
padding: 10px; margin: 5px;">
<strong style="color: white;">Answer {answer_number}:</strong>
<div style="color: white;">{markdown.markdown(response)}</div>
</div>
'''


def format_response_1_html(response):
    """Return the HTML card for the response shown as "Answer 1".

    Args:
        response: Response text; it is converted from Markdown to HTML.

    Returns:
        str: The HTML snippet for the first answer panel.
    """
    return _format_response_html(response, 1)


def format_response_2_html(response):
    """Return the HTML card for the response shown as "Answer 2".

    Args:
        response: Response text; it is converted from Markdown to HTML.

    Returns:
        str: The HTML snippet for the second answer panel.
    """
    return _format_response_html(response, 2)
# Number of votes handled by this process; submit_choice pushes the log to the
# Hub each time this reaches a multiple of 10.
counter = 0

# Votes recorded since the last push, buffered in memory.
accumulated_log = pd.DataFrame(
    columns=["instruction", "selected_model", "pair", "submission_time"],
)
def _format_question_html(instruction):
    """Wrap *instruction* in the centered "Question:" banner shown above the answers."""
    return f"""
<div style="text-align: center; font-size: 24px; font-weight: bold; margin-top: 20px;">
Question:
</div>
<div style="text-align: center; font-size: 20px; margin-top: 10px;">
{instruction}
</div>
"""


def submit_choice(selected_response, instruction, label_1, label_2, pair_name):
    """Record one vote and load the next question.

    The vote is stored against the question that was actually on screen
    (``instruction``), buffered in ``accumulated_log``, and every 10th vote
    the buffer is merged into the Hub log dataset and pushed.

    Args:
        selected_response: Value of the radio widget, "Answer 1" or
            "Answer 2"; anything else (e.g. ``None`` when nothing was
            selected) is rejected without recording a vote.
        instruction: The question the user just voted on.
        label_1: Model label behind "Answer 1".
        label_2: Model label behind "Answer 2".
        pair_name: Name of the dataset the current question came from.

    Returns:
        tuple: ``(question_html, answer_1_html, answer_2_html, label_1,
        label_2, pair_name, status_message, instruction)`` describing the
        next question. The trailing ``instruction`` element refreshes the
        instruction state so the following vote is logged against the right
        question. When no answer was selected the displays are left
        untouched and the incoming state values are returned unchanged.
    """
    global counter, accumulated_log

    if selected_response not in ("Answer 1", "Answer 2"):
        # Nothing selected: keep the current question and do not log a vote.
        return (
            gr.update(),
            gr.update(),
            gr.update(),
            label_1,
            label_2,
            pair_name,
            "Please select an answer before submitting.",
            instruction,
        )

    selected_model = label_1 if selected_response == "Answer 1" else label_2
    submission_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    new_entry = pd.DataFrame({
        # Log the question that was voted on, not the next one to be shown.
        "instruction": [instruction],
        "selected_model": [selected_model],
        "pair": [pair_name],
        "submission_time": [submission_time]
    })
    accumulated_log = pd.concat([accumulated_log, new_entry], ignore_index=True)
    counter += 1

    if counter % 10 == 0:
        # The remote log is only needed when pushing, so fetch it here rather
        # than on every click.
        try:
            df_log = pd.DataFrame(load_dataset(log_dataset)["train"])
        except Exception:
            # NOTE(review): any load failure (including a transient network
            # error) starts from an empty log, so the push below replaces the
            # remote log with only the buffered votes - confirm this is wanted.
            df_log = pd.DataFrame(columns=["instruction", "selected_model",
                                           "pair", "submission_time"])
        df_log = pd.concat([df_log, accumulated_log], ignore_index=True)
        df_log.to_csv("annotations_log.csv", index=False)
        accumulated_log = pd.DataFrame(columns=["instruction", "selected_model", "pair", "submission_time"])
        log = Dataset.from_pandas(df_log)
        log.push_to_hub(log_dataset)

    (new_instruction, new_response_1, new_response_2,
     new_label_1, new_label_2, new_pair_name) = get_random_row()

    return (
        _format_question_html(new_instruction),
        format_response_1_html(new_response_1),
        format_response_2_html(new_response_2),
        new_label_1,
        new_label_2,
        new_pair_name,
        "Your choice has been recorded. A new question is loaded!",
        new_instruction,
    )


def create_interface():
    """Build the Gradio voting UI and wire the submit button to submit_choice.

    A first question is drawn when the interface is built and used as the
    initial value of the displays and state holders.

    Returns:
        gr.Blocks: The assembled (not yet launched) interface.
    """
    instruction, response_1, response_2, label_1, label_2, pair_name = get_random_row()

    with gr.Blocks(theme=gr.themes.Default()) as demo:
        gr.HTML("""
<div style="text-align: center;">
<h1>Human-Likeness Voting System</h1>
</div>
""")
        gr.Markdown("This interface has been created to compare the performance of the human-like LLMs developed by our team with the models on which they were trained. The results of this study will be presented in a paper. Please ensure that your responses are fair and accurate when casting your vote and selecting the appropriate answer. We thank you for your contributions on behalf of the research team.")
        gr.Markdown("## Instructions")
        gr.Markdown(
            """
1. First, read the provided question carefully.
2. Second, read both responses carefully.
3. Finally, select the model that best resembles a human in terms of response quality."""
        )

        # State for the question currently on screen; all four holders are
        # refreshed by every submit so the next vote is attributed correctly.
        current_instruction = gr.State(instruction)
        label_1_state = gr.State(label_1)
        label_2_state = gr.State(label_2)
        pair_name_state = gr.State(pair_name)

        question_display = gr.HTML(value=_format_question_html(instruction))

        with gr.Row():
            with gr.Column():
                response_1_display = gr.HTML(format_response_1_html(response_1))
            with gr.Column():
                response_2_display = gr.HTML(format_response_2_html(response_2))

        # NOTE(review): widget nesting was reconstructed; confirm which
        # widgets are meant to sit inside this Row.
        with gr.Row():
            selected_response = gr.Radio(
                ["Answer 1", "Answer 2"],
                label="Which answer is better?",
                interactive=True,
            )
            submit_btn = gr.Button("Submit Choice")

        status_output = gr.Textbox(
            interactive=False,
            label="Status",
            value="Select an answer and click Submit"
        )

        submit_btn.click(
            fn=submit_choice,
            inputs=[
                selected_response,
                current_instruction,
                label_1_state,
                label_2_state,
                pair_name_state
            ],
            # Order must match the tuple returned by submit_choice.
            outputs=[
                question_display,
                response_1_display,
                response_2_display,
                label_1_state,
                label_2_state,
                pair_name_state,
                status_output,
                current_instruction
            ]
        )

    return demo
if __name__ == "__main__":
    # Script entry point: build the voting UI, then serve it. share=True is
    # passed to launch() exactly as before.
    interface = create_interface()
    interface.launch(share=True)
|