# LLM-model-cards / app.py
import os
from threading import Lock

import gradio as gr
import pandas as pd

from Config import TOPICS
from Sample import sample_random_entry

# Guards concurrent writes to the response CSVs.
lock = Lock()

# Metadata for the currently displayed entry, populated by sample_and_display().
info_dict = {}
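
# Assumed interface of Sample.sample_random_entry (its module is not shown here):
# it returns a (display_dict, info_dict) pair, roughly shaped as
#   display_dict = {'qa': <question text>, 'card': <evaluation card text>}
#   info_dict    = {'index': int, 'model': str, 'completion': str,
#                   'correctness': bool, 'verdict': int}  # verdict is a 0-based choice index
# These field names are inferred from how the dicts are used below.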
def append_to_csv(output_path, row_data, header_names):
    """Append a single response row to a CSV file, creating it (with headers) if needed."""
    # Serialize file access across concurrent Gradio requests.
    with lock:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Pass columns explicitly so the row order always matches the header order.
        df = pd.DataFrame([row_data], columns=header_names)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            # File exists and is non-empty: append without repeating the header.
            df.to_csv(output_path, mode='a', header=False, index=False)
        else:
            # File is missing or empty: write it with headers.
            df.to_csv(output_path, mode='w', header=True, index=False)
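
# After a couple of submissions the CSV would look like this (illustrative values,
# not real logged data):
#   index,model,reasoning,correctness,confidence
#   12,gpt-3.5-turbo,"Looks like an easy question",True,7
#   47,llama-2-7b,"",False,3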
def sample_and_display(topic):
    """Sample a new entry (for the given topic, if any) and reset the output fields."""
    global info_dict
    display_dict, info_dict = sample_random_entry(topic=topic) if topic else sample_random_entry()
    question_text = display_dict['qa']
    evaluation_card_text = display_dict['card']
    model_name = ''       # Clear the model name until the user submits a guess.
    completion_text = ''  # Clear the completion text as well.
    return question_text, evaluation_card_text, model_name, completion_text
def evaluate_guess(reasoning, correctness, confidence, topic):
    """Score the user's guess against the model's actual result and log the response."""
    global info_dict
    # The sampled entry records whether the model actually answered correctly.
    correct_answer = 'Correctly' if info_dict['correctness'] else 'Incorrectly'
    evaluation_response = "Correct" if correctness == correct_answer else "Incorrect"

    # info_dict is populated by sample_and_display().
    actual_model = info_dict.get('model', 'Unknown Model')
    actual_completion = info_dict.get('completion', 'No completion available.')

    # Map the 0-based verdict index to a letter choice (0 -> 'A', 1 -> 'B', ...).
    completion_text = f"Completion: {actual_completion}\n\nChoice: {chr(info_dict.get('verdict', 0) + 65)}"

    # Log the response for this question and topic.
    entry = {
        'index': info_dict.get('index', -1),
        'model': actual_model,
        'reasoning': reasoning,
        'correctness': correctness == correct_answer,  # Was the user's guess right?
        'confidence': confidence,
    }
    header_names = ['index', 'model', 'reasoning', 'correctness', 'confidence']
    output_path = f'responses/mmlu/{topic}/response.csv'
    append_to_csv(output_path, entry, header_names)
    return evaluation_response, actual_model, completion_text
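
# Walking through the logic with hypothetical values: if info_dict['correctness']
# is True and the user picked 'Correctly', evaluation_response is 'Correct' and
# the logged row records correctness=True; picking 'Incorrectly' instead would
# yield 'Incorrect' and correctness=False.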
# Initial sampling. TOPICS maps a dataset name ('mmlu') to its list of topics.
initial_topic = TOPICS['mmlu'][0]
question_text, evaluation_card_text, model_name, completion_text = sample_and_display(initial_topic)
with gr.Blocks() as app:
    topic = gr.Dropdown(choices=TOPICS['mmlu'], label="Select Topic", value=initial_topic)
    with gr.Row():
        with gr.Column(scale=2):
            evaluation_card = gr.Textbox(value=evaluation_card_text, label="Evaluation Card", interactive=False)
            model = gr.Textbox(value=model_name, label="Model", interactive=False)
            completion = gr.Textbox(value=completion_text, label="Model's Completion", interactive=False)
        with gr.Column(scale=1):
            question = gr.Textbox(value=question_text, label="Question", interactive=False)
            reasoning = gr.Textbox(lines=5, placeholder="Your reasoning (optional)")
            # Choices must match the 'Correctly'/'Incorrectly' strings compared in evaluate_guess().
            correctness = gr.Radio(choices=["Correctly", "Incorrectly"], label="I believe the model will answer this question")
            confidence = gr.Slider(minimum=0, maximum=10, step=1, label="Confidence")
            output_text = gr.Text(label="Evaluation Output")
            submit_button = gr.Button("Submit")
            next_button = gr.Button("Next Entry")

    submit_button.click(fn=evaluate_guess, inputs=[reasoning, correctness, confidence, topic], outputs=[output_text, model, completion])
    next_button.click(fn=sample_and_display, inputs=[topic], outputs=[question, evaluation_card, model, completion])

app.launch()