File size: 7,274 Bytes
61261dd
 
 
 
 
 
 
6eb8685
61261dd
 
 
 
 
eaac9a1
c5a32fe
8b5f978
c5a32fe
d12c61c
6e511fa
 
61261dd
 
 
81ced9c
 
 
61261dd
8b5f978
6e511fa
 
 
 
 
 
 
 
 
8b5f978
6e511fa
 
61261dd
 
 
 
 
6e511fa
c5a32fe
61261dd
 
81ced9c
61261dd
 
d08d32b
61261dd
 
 
d08d32b
6eb8685
d08d32b
 
c5a32fe
 
d08d32b
 
 
 
6eb8685
61261dd
 
 
805de22
 
 
61261dd
805de22
c5a32fe
6e511fa
c5a32fe
61261dd
 
805de22
61261dd
 
 
c5a32fe
bfb0da9
c5a32fe
61261dd
bfb0da9
61261dd
 
 
 
805de22
c5a32fe
805de22
c5a32fe
6e511fa
805de22
 
 
 
c5a32fe
 
 
c953b7b
 
 
 
 
 
 
61261dd
 
c5a32fe
d08d32b
 
61261dd
 
 
 
 
 
 
 
 
 
003e27c
 
 
c5b48e4
003e27c
 
61261dd
 
 
 
 
 
 
 
c5a32fe
61261dd
 
 
 
003e27c
 
 
 
 
 
 
 
 
 
 
61261dd
 
d08d32b
61261dd
d08d32b
61261dd
 
 
 
e421714
61261dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import gradio as gr
import pandas as pd
import random
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, login
import os
from datetime import datetime
import markdown

# Hub API client handle, created before authenticating.
hf_api = HfApi()

# Access token comes from the environment; this is None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get('HF_TOKEN')
login(token=HF_TOKEN)

# Hub dataset repository that receives the recorded votes.
log_dataset = "HumanLLMs/log"

# (pair_name, row_index) tuples that have already been served to a voter.
selected_indices = set()

# "train" split of each comparison dataset, loaded in a fixed order.
dataset_1, dataset_2, dataset_3 = [
    load_dataset(repo_id)["train"]
    for repo_id in (
        "HumanLLMs/LlamaPair",
        "HumanLLMs/QwenPair",
        "HumanLLMs/MistralPair",
    )
]

# Empty frame carrying the column layout of the vote log.
df_log = pd.DataFrame(
    columns=["instruction", "selected_model", "pair", "submission_time"]
)

def remove_emojis(text):
    """Return ``text`` with every non-ASCII character dropped.

    Emojis disappear because they are non-ASCII, but so does any other
    character outside the ASCII range (accented letters, CJK text, ...).
    """
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def get_random_row():
    """Draw a random, not-yet-served comparison row.

    One of the three pair datasets is chosen at random, then a random row
    index that is not yet in ``selected_indices`` for that dataset. When
    every row of the chosen dataset has already been served, that dataset's
    entries are removed from ``selected_indices`` so rows can be served
    again (the previous rejection loop would never terminate in that case).

    Returns:
        tuple: ``(instruction, response_a, response_b, label_a, label_b,
        pair_name)`` where the two responses are in random order, have had
        non-ASCII characters stripped by ``remove_emojis``, and each label
        is ``"Human-like Model"`` or ``"Official Model"`` for the response
        in the same position.
    """
    # Keep each dataset next to its name so the name never has to be
    # recovered by comparing dataset objects.
    pair_name, selected_dataset = random.choice([
        ("LlamaPair", dataset_1),
        ("QwenPair", dataset_2),
        ("MistralPair", dataset_3),
    ])
    dataset_size = len(selected_dataset)

    # If this dataset is exhausted, forget its served rows; otherwise the
    # rejection loop below could spin forever.
    served_for_pair = {
        entry for entry in selected_indices if entry[0] == pair_name
    }
    if len(served_for_pair) >= dataset_size:
        selected_indices.difference_update(served_for_pair)

    idx = random.randint(0, dataset_size - 1)
    while (pair_name, idx) in selected_indices:
        idx = random.randint(0, dataset_size - 1)

    selected_indices.add((pair_name, idx))
    row = selected_dataset[idx]
    instruction = row["instruction"]
    response_human = row["response_human_like_model"]
    # "offical" is the spelling of the column key read here; keep it as is.
    response_official = row["response_offical_instruct_model"]

    responses = [
        ("Human-like Model", response_human),
        ("Official Model", response_official),
    ]
    # Randomise left/right placement so position does not reveal the model.
    random.shuffle(responses)

    return (
        instruction,
        remove_emojis(responses[0][1]),
        remove_emojis(responses[1][1]),
        responses[0][0],
        responses[1][0],
        pair_name,
    )

def _format_response_html(answer_number, response):
    """Render ``response`` (Markdown) inside the boxed "Answer N" HTML card."""
    rendered = markdown.markdown(response)
    return f'''
        <div style="border: 1px solid white; background-color: black; 
                    padding: 10px; margin: 5px;">
            <strong style="color: white;">Answer {answer_number}:</strong>
            <div style="color: white;">{rendered}</div>
        </div>
    '''


def format_response_1_html(response):
    """Return the HTML card titled "Answer 1:" for the Markdown ``response``."""
    return _format_response_html(1, response)


def format_response_2_html(response):
    """Return the HTML card titled "Answer 2:" for the Markdown ``response``."""
    return _format_response_html(2, response)

# Number of votes handled so far; submit_choice uploads on every 10th one.
counter = 0

# Votes buffered in memory until the next upload.
accumulated_log = pd.DataFrame(
    columns=[
        "instruction",
        "selected_model",
        "pair",
        "submission_time",
    ],
)

def _render_question_html(instruction):
    """Return the centred "Question:" HTML block showing ``instruction``."""
    return f"""
            <div style="text-align: center; font-size: 24px; font-weight: bold; margin-top: 20px;">
                Question:
            </div>
            <div style="text-align: center; font-size: 20px; margin-top: 10px;">
                {instruction}
            </div>
            """


def _flush_votes_to_hub():
    """Append the buffered votes to the Hub log and upload the result.

    The merged log is also written to ``annotations_log.csv``. The in-memory
    buffer is cleared only after the upload returns, so a failed upload does
    not drop the buffered votes.
    """
    global accumulated_log

    try:
        df_log = pd.DataFrame(load_dataset(log_dataset)["train"])
    except Exception:
        # Best effort, as before: any failure to read the existing log is
        # treated as "no log yet".
        # NOTE(review): a transient download error therefore leads to the
        # Hub log being overwritten with only the buffered votes - confirm
        # this is acceptable.
        df_log = pd.DataFrame(columns=["instruction", "selected_model",
                                       "pair", "submission_time"])

    df_log = pd.concat([df_log, accumulated_log], ignore_index=True)
    df_log.to_csv("annotations_log.csv", index=False)
    Dataset.from_pandas(df_log).push_to_hub(log_dataset)

    accumulated_log = pd.DataFrame(columns=["instruction", "selected_model", "pair", "submission_time"])


def _handle_vote(selected_response, instruction, label_1, label_2, pair_name):
    """Record one vote and draw the next question (8-value UI handler).

    Args:
        selected_response: ``"Answer 1"`` or ``"Answer 2"``; anything else
            (e.g. ``None`` when nothing is selected) is rejected.
        instruction: The question the voter was shown.
        label_1: Model label behind "Answer 1".
        label_2: Model label behind "Answer 2".
        pair_name: Name of the dataset pair the question came from.

    Returns:
        tuple: ``(next_instruction, question_html, response_1_html,
        response_2_html, next_label_1, next_label_2, next_pair_name,
        status_message)``. When the selection is rejected the state values
        are returned unchanged and the three HTML slots are ``gr.update()``
        so the displayed question stays as it is.
    """
    global counter, accumulated_log

    if selected_response not in ("Answer 1", "Answer 2"):
        # Without this guard an empty selection was logged as a vote for
        # Answer 2 and the question was skipped.
        return (
            instruction,
            gr.update(),
            gr.update(),
            gr.update(),
            label_1,
            label_2,
            pair_name,
            "Please select an answer before submitting.",
        )

    selected_model = label_1 if selected_response == "Answer 1" else label_2
    submission_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Log the question that was actually voted on, not the next one.
    new_entry = pd.DataFrame({
        "instruction": [instruction],
        "selected_model": [selected_model],
        "pair": [pair_name],
        "submission_time": [submission_time]
    })
    accumulated_log = pd.concat([accumulated_log, new_entry], ignore_index=True)

    counter += 1

    # Upload in batches of ten votes; the existing log is only downloaded
    # when a batch is due.
    if counter % 10 == 0:
        _flush_votes_to_hub()

    (new_instruction, new_response_1, new_response_2,
     new_label_1, new_label_2, new_pair_name) = get_random_row()

    return (
        new_instruction,
        _render_question_html(new_instruction),
        format_response_1_html(new_response_1),
        format_response_2_html(new_response_2),
        new_label_1,
        new_label_2,
        new_pair_name,
        "Your choice has been recorded. A new question is loaded!"
    )


def submit_choice(selected_response, instruction, label_1, label_2, pair_name):
    """Record a vote and return the next question's display values.

    Same parameters as ``_handle_vote``. Returns the original 7-tuple
    ``(question_html, response_1_html, response_2_html, next_label_1,
    next_label_2, next_pair_name, status_message)``; the next raw
    instruction is not part of it, so a caller of this function has to
    track the displayed instruction itself.
    """
    return _handle_vote(
        selected_response, instruction, label_1, label_2, pair_name
    )[1:]


def create_interface():
    """Build and return the Gradio Blocks app for the voting UI.

    The click handler writes the next instruction back into the
    ``current_instruction`` state so each logged vote refers to the question
    that was on screen when it was cast.
    """
    # NOTE(review): the first question is drawn once, when the interface is
    # built, so presumably every new session starts on this same question -
    # confirm whether a per-session draw is wanted.
    instruction, response_1, response_2, label_1, label_2, pair_name = get_random_row()
    
    with gr.Blocks(theme=gr.themes.Default()) as demo:
        gr.HTML("""
        <div style="text-align: center;">
            <h1>Human-Likeness Voting System</h1>
            
        </div>
        """)
        gr.Markdown("This interface has been created to compare the performance of the human-like LLMs developed by our team with the models on which they were trained. The results of this study will be presented in a paper. Please ensure that your responses are fair and accurate when casting your vote and selecting the appropriate answer. We thank you for your contributions on behalf of the research team.")
        gr.Markdown("## Instructions")
        gr.Markdown(
            """
             1. First, read the provided question carefully.
             2. Second, read both responses carefully.
             3. Finally, select the model that best resembles a human in terms of response quality."""
        )
        
        # State mirroring what is currently on screen.
        current_instruction = gr.State(instruction)
        label_1_state = gr.State(label_1)
        label_2_state = gr.State(label_2)
        pair_name_state = gr.State(pair_name)
        question_display = gr.HTML(value=_render_question_html(instruction))

        with gr.Row():
            with gr.Column():
                response_1_display = gr.HTML(format_response_1_html(response_1))
            with gr.Column():
                response_2_display = gr.HTML(format_response_2_html(response_2))
        with gr.Row():
            selected_response = gr.Radio(
                ["Answer 1", "Answer 2"],
                label="Which answer is better?",
                interactive=True,
            )
            submit_btn = gr.Button("Submit Choice")
        
        status_output = gr.Textbox(
            interactive=False,
            label="Status",
            value="Select an answer and click Submit"
        )
        submit_btn.click(
            fn=_handle_vote,
            inputs=[
                selected_response,
                current_instruction,
                label_1_state,
                label_2_state,
                pair_name_state
            ],
            # Order matches the 8-tuple returned by _handle_vote.
            outputs=[
                current_instruction,
                question_display,
                response_1_display,
                response_2_display,
                label_1_state,
                label_2_state,
                pair_name_state,
                status_output
            ]
        )
        
        return demo

if __name__ == "__main__":
    # Build the app and launch it with share=True when run as a script.
    create_interface().launch(share=True)