File size: 4,562 Bytes
ae6a8c0
de1d92a
 
91143ec
 
 
de1d92a
91143ec
de1d92a
 
91143ec
 
 
 
 
01022c9
91143ec
dfba357
 
91143ec
 
dfba357
 
 
91143ec
01022c9
91143ec
 
 
 
de1d92a
 
 
 
 
 
 
 
1a73201
de1d92a
 
 
 
 
2ec5af9
09056a8
 
abedf13
 
de1d92a
 
 
abedf13
de1d92a
 
91143ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de1d92a
7e3fbab
de1d92a
 
1a73201
7e3fbab
 
de1d92a
7e3fbab
de1d92a
 
 
 
7e3fbab
abedf13
7e3fbab
63eddff
09056a8
1a73201
 
7e3fbab
de1d92a
 
 
1a73201
7e3fbab
de1d92a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import gradio as gr
from Sample import sample_random_entry
from Config import TOPICS
import pandas as pd
import os
from threading import Lock

# Held by append_to_csv for the whole check-then-write sequence so that
# concurrent callers cannot interleave their writes to a CSV file.
lock = Lock()
# Metadata of the most recently sampled entry: rebound by sample_and_display
# and read by evaluate_guess.
# NOTE(review): this is module-level state, so it is presumably shared by
# every browser session of the app -- confirm whether per-session state is needed.
info_dict = {}

def append_to_csv(output_path, row_data, header_names):
    """Append one row to the CSV at ``output_path``, creating it if needed.

    The header line is written only when the file is missing or empty.
    Missing parent directories are created first, and the row is always
    laid out in ``header_names`` order so appended values line up with the
    header regardless of the key order of ``row_data``.

    Args:
        output_path: Path of the CSV file to write.
        row_data: A single row, either a dict keyed by column name or a
            sequence of values in ``header_names`` order.
        header_names: Column names, in the order they appear in the file.
    """
    with lock:
        # DataFrame.to_csv does not create directories, so make sure the
        # target folder exists. A bare filename has no parent to create.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        has_content = os.path.exists(output_path) and os.path.getsize(output_path) > 0
        if has_content:
            print(f"Appending to {output_path}")
        else:
            print(f"Writing to {output_path}")

        # Passing columns= in both cases pins the column order; without it a
        # dict row would be written in its own key order when appending.
        df = pd.DataFrame([row_data], columns=header_names)
        df.to_csv(
            output_path,
            mode='a' if has_content else 'w',
            header=not has_content,
            index=False,
        )

def sample_and_display(topic):
    """Sample a new entry and return the values that reset the UI for it.

    The sampled entry's metadata is stored in the module-level ``info_dict``
    for later use by ``evaluate_guess``.

    Args:
        topic: Topic to sample from; when falsy, ``sample_random_entry`` is
            called without a topic argument.

    Returns:
        A 5-tuple ``(evaluation_output, question, evaluation_card, model,
        completion)`` where the evaluation output, model and completion are
        empty strings so that the previous entry's results are cleared.
    """
    global info_dict
    if topic:
        display_dict, info_dict = sample_random_entry(topic=topic)
    else:
        display_dict, info_dict = sample_random_entry()

    shown_question = display_dict['qa']
    shown_card = display_dict['card']
    # Evaluation output, model name and completion are blanked for the new entry.
    return '', shown_question, shown_card, '', ''

def evaluate_guess(reasoning, correctness, confidence, topic):
    """Score the user's prediction for the current entry and log it to CSV.

    The prediction is compared with ``info_dict['correctness']`` (whether the
    model answered the sampled question correctly). The result is appended to
    ``responses/mmlu/<topic>/response.csv`` via ``append_to_csv``.

    Args:
        reasoning: Free-text reasoning entered by the user.
        correctness: The user's prediction, ``"Correct"`` or ``"Incorrect"``;
            ``None`` when no radio option has been selected.
        confidence: The user's confidence value from the slider.
        topic: Topic name used to build the path of the response file.

    Returns:
        A 3-tuple ``(evaluation_response, actual_model, completion_text)``.
        When no prediction was selected, a prompt message is returned with
        empty model and completion strings, and nothing is logged.
    """
    if correctness is None:
        # Without this guard an unanswered submission would be scored as
        # "Incorrect" and written to the CSV as a wrong guess.
        return "Please select Correct or Incorrect before submitting.", '', ''

    correct_answer = 'Correct' if info_dict['correctness'] else 'Incorrect'
    guessed_right = correctness == correct_answer
    evaluation_response = "Correct" if guessed_right else "Incorrect"

    # Reveal which model produced the completion and what it answered.
    actual_model = info_dict.get('model', 'Unknown Model')
    actual_completion = info_dict.get('completion', 'No completion available.')
    # 'verdict' is rendered as a letter by offsetting from 'A' (0 -> 'A');
    # a missing verdict falls back to 0.
    choice_letter = chr(info_dict.get('verdict', 0) + 65)
    completion_text = f"Completion: {actual_completion}\n\nChoice: {choice_letter}"

    # Note: the logged 'correctness' column records whether the USER's
    # prediction was right, not whether the model's answer was.
    entry = {
        'index': info_dict.get('index', -1),
        'model': actual_model,
        'reasoning': reasoning,
        'correctness': guessed_right,
        'confidence': confidence,
    }
    header_names = ['index', 'model', 'reasoning', 'correctness', 'confidence']

    output_path = f'responses/mmlu/{topic}/response.csv'
    append_to_csv(output_path, entry, header_names)

    return evaluation_response, actual_model, completion_text

# Sample one entry up front so the widgets have content on first render.
# TOPICS['mmlu'] is indexed as a sequence; its first item is the default topic.
initial_topic = TOPICS['mmlu'][0]
(
    correct_text,
    question_text,
    evaluation_card_text,
    model_name,
    completion_text,
) = sample_and_display(initial_topic)

with gr.Blocks() as app:
    topic = gr.Dropdown(
        choices=TOPICS['mmlu'],
        label="Select Topic",
        value=initial_topic,
    )
    with gr.Row():
        # Wider left column: the evaluation card plus the model details that
        # evaluate_guess fills in after a submission.
        with gr.Column(scale=2):
            evaluation_card = gr.Textbox(
                value=evaluation_card_text,
                label="Evaluation Card",
                interactive=False,
            )
            model = gr.Textbox(
                value=model_name,
                label="Model",
                interactive=False,
            )
            completion = gr.Textbox(
                value=completion_text,
                label="Model's Completion",
                interactive=False,
            )
        # Narrower right column: the question and the user's prediction inputs.
        with gr.Column(scale=1):
            question = gr.Textbox(
                value=question_text,
                label="Question",
                interactive=False,
            )
            reasoning = gr.Textbox(
                lines=5,
                placeholder="Your reasoning (optional)",
            )
            correctness = gr.Radio(
                choices=["Correct", "Incorrect"],
                label="I believe the model will answer this question",
            )
            confidence = gr.Slider(
                minimum=1,
                maximum=5,
                step=1,
                value=3,
                label="Confidence",
            )
            output_text = gr.Textbox(
                value=correct_text,
                label="Evaluation Output",
                interactive=False,
            )
            submit_button = gr.Button("Submit")
            next_button = gr.Button("Next Entry")

    # Submit scores the prediction and reveals the model and its completion.
    submit_button.click(
        fn=evaluate_guess,
        inputs=[reasoning, correctness, confidence, topic],
        outputs=[output_text, model, completion],
    )
    # Next Entry samples a new question and clears the previous results.
    next_button.click(
        fn=sample_and_display,
        inputs=[topic],
        outputs=[output_text, question, evaluation_card, model, completion],
    )

app.launch()