# LLM-model-cards / app.py
import os
from threading import Lock

import gradio as gr
import pandas as pd

from Config import TOPICS
from Sample import sample_random_entry

# Guards concurrent writes to the response CSVs.
lock = Lock()

# Metadata for the currently displayed entry, populated by sample_and_display().
info_dict = {}
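
# Assumed interface of Sample.sample_random_entry (its module is not shown here):
# it returns a (display_dict, info_dict) pair, roughly shaped as
#   display_dict = {'qa': <question text>, 'card': <evaluation card text>}
#   info_dict    = {'index': int, 'model': str, 'completion': str,
#                   'correctness': bool, 'verdict': int}  # verdict is a 0-based choice index
# These field names are inferred from how the dicts are used below.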
def append_to_csv(output_path, row_data, header_names):
    """Append a single response row to a CSV file, creating it (with headers) if needed."""
    # Serialize file access across concurrent Gradio requests.
    with lock:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Pass columns explicitly so the row order always matches the header order.
        df = pd.DataFrame([row_data], columns=header_names)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            # File exists and is non-empty: append without repeating the header.
            df.to_csv(output_path, mode='a', header=False, index=False)
        else:
            # File is missing or empty: write it with headers.
            df.to_csv(output_path, mode='w', header=True, index=False)
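
# After a couple of submissions the CSV would look like this (illustrative values,
# not real logged data):
#   index,model,reasoning,correctness,confidence
#   12,gpt-3.5-turbo,"Looks like an easy question",True,7
#   47,llama-2-7b,"",False,3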
def sample_and_display(topic):
    """Sample a new entry (for the given topic, if any) and reset the output fields."""
    global info_dict
    display_dict, info_dict = sample_random_entry(topic=topic) if topic else sample_random_entry()
    question_text = display_dict['qa']
    evaluation_card_text = display_dict['card']
    model_name = ''       # Clear the model name until the user submits a guess.
    completion_text = ''  # Clear the completion text as well.
    return question_text, evaluation_card_text, model_name, completion_text
def evaluate_guess(reasoning, correctness, confidence, topic):
    """Score the user's guess against the model's actual result and log the response."""
    global info_dict
    # The sampled entry records whether the model actually answered correctly.
    correct_answer = 'Correctly' if info_dict['correctness'] else 'Incorrectly'
    evaluation_response = "Correct" if correctness == correct_answer else "Incorrect"

    # info_dict is populated by sample_and_display().
    actual_model = info_dict.get('model', 'Unknown Model')
    actual_completion = info_dict.get('completion', 'No completion available.')

    # Map the 0-based verdict index to a letter choice (0 -> 'A', 1 -> 'B', ...).
    completion_text = f"Completion: {actual_completion}\n\nChoice: {chr(info_dict.get('verdict', 0) + 65)}"

    # Log the response for this question and topic.
    entry = {
        'index': info_dict.get('index', -1),
        'model': actual_model,
        'reasoning': reasoning,
        'correctness': correctness == correct_answer,  # Was the user's guess right?
        'confidence': confidence,
    }
    header_names = ['index', 'model', 'reasoning', 'correctness', 'confidence']
    output_path = f'responses/mmlu/{topic}/response.csv'
    append_to_csv(output_path, entry, header_names)
    return evaluation_response, actual_model, completion_text
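
# Walking through the logic with hypothetical values: if info_dict['correctness']
# is True and the user picked 'Correctly', evaluation_response is 'Correct' and
# the logged row records correctness=True; picking 'Incorrectly' instead would
# yield 'Incorrect' and correctness=False.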
# Initial sampling. TOPICS maps a dataset name ('mmlu') to its list of topics.
initial_topic = TOPICS['mmlu'][0]
question_text, evaluation_card_text, model_name, completion_text = sample_and_display(initial_topic)
with gr.Blocks() as app:
    topic = gr.Dropdown(choices=TOPICS['mmlu'], label="Select Topic", value=initial_topic)
    with gr.Row():
        with gr.Column(scale=2):
            evaluation_card = gr.Textbox(value=evaluation_card_text, label="Evaluation Card", interactive=False)
            model = gr.Textbox(value=model_name, label="Model", interactive=False)
            completion = gr.Textbox(value=completion_text, label="Model's Completion", interactive=False)
        with gr.Column(scale=1):
            question = gr.Textbox(value=question_text, label="Question", interactive=False)
            reasoning = gr.Textbox(lines=5, placeholder="Your reasoning (optional)")
            # Choices must match the 'Correctly'/'Incorrectly' strings compared in evaluate_guess().
            correctness = gr.Radio(choices=["Correctly", "Incorrectly"], label="I believe the model will answer this question")
            confidence = gr.Slider(minimum=0, maximum=10, step=1, label="Confidence")
            output_text = gr.Text(label="Evaluation Output")
            submit_button = gr.Button("Submit")
            next_button = gr.Button("Next Entry")

    submit_button.click(fn=evaluate_guess, inputs=[reasoning, correctness, confidence, topic], outputs=[output_text, model, completion])
    next_button.click(fn=sample_and_display, inputs=[topic], outputs=[question, evaluation_card, model, completion])

app.launch()