# Gradio arena app: shows a random question with answers from two anonymous models,
# lets users vote, and keeps win/loss counts and TrueSkill ratings in Firestore.
import random

import gradio as gr
import pandas as pd

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

import trueskill
from trueskill import Rating

CSV_FILE_PATH = "qa_pairs.csv"

# Initialise the Firebase Admin SDK with the service-account key.
cred = credentials.Certificate("unlpboard_f.json")
firebase_admin.initialize_app(cred)


def list_models():
    df = pd.read_csv(CSV_FILE_PATH)
    return df['model'].unique().tolist()


def list_questions():
    df = pd.read_csv(CSV_FILE_PATH)
    return df['question'].unique().tolist()


def fetch_questions():
    # Read every document from the 'questions' collection.
    questions_ref = db.collection('questions')
    docs = questions_ref.stream()
    questions_list = []
    for doc in docs:
        questions_list.append(doc.to_dict())
    return questions_list


def display_answers(question, model1, model2, df):
    # Look up each model's answer to the question; fall back to a placeholder.
    # df = pd.read_csv(CSV_FILE_PATH)
    answers = {
        model1: "No answer available for Model 1",
        model2: "No answer available for Model 2",
    }
    for model in [model1, model2]:
        filtered_df = df[(df['question'] == question) & (df['model'] == model)]
        if not filtered_df.empty:
            answers[model] = f"**Answer:**\n{filtered_df['answer'].iloc[0]}"
    return answers[model1], answers[model2]


def update_b(q, m1, a1, m2, a2):
    # Called by the START! button: draw the first match-up and enable the vote buttons.
    print('Model1: ', random_model1)
    print('Model2: ', random_model2)
    q, m1, a1, m2, a2 = update_symbols(q, m1, a1, m2, a2)
    b1 = gr.Button("Vote for Model 1", interactive=True)
    b2 = gr.Button("It’s a tie!", interactive=True)
    b3 = gr.Button("Vote for Model 2", interactive=True)
    b4 = gr.Button("START!", visible=False)
    return q, m1, a1, m2, a2, b1, b2, b3, b4


def update_symbols1(q, m1, a1, m2, a2):
    # Model 1 won: log the vote, then update win/loss counts and TrueSkill ratings.
    print("Voted for Model 1")
    log_vote(model1=m1, model2=m2, question=q, output1=a1, output2=a2, outcome=m1)
    votes_ref = db.collection('votes')

    vote_doc = votes_ref.document(m1).get()
    # Only mu is persisted, so sigma resets to the TrueSkill default each match.
    elo1 = Rating(vote_doc.get('elo_rating'))
    if vote_doc.exists:
        votes_ref.document(m1).update({'win_count': firestore.Increment(1)})
    else:
        votes_ref.document(m1).set({'win_count': 1})

    vote_doc = votes_ref.document(m2).get()
    elo2 = Rating(vote_doc.get('elo_rating'))
    # rate_1vs1 takes (winner, loser) and returns the updated ratings in that order.
    elo1, elo2 = trueskill.rate_1vs1(elo1, elo2)
    votes_ref.document(m2).update({'elo_rating': elo2.mu})
    votes_ref.document(m1).update({'elo_rating': elo1.mu})
    if vote_doc.exists:
        votes_ref.document(m2).update({'loss_count': firestore.Increment(1)})
    else:
        votes_ref.document(m2).set({'loss_count': 1})
    return update_symbols(q, m1, a1, m2, a2)


def update_symbols2(q, m1, a1, m2, a2):
    # Tie: log the vote without touching counts or ratings.
    print("Voted for a tie")
    log_vote(model1=m1, model2=m2, question=q, output1=a1, output2=a2, outcome='tie')
    # update_total_votes()
    return update_symbols(q, m1, a1, m2, a2)


def update_symbols3(q, m1, a1, m2, a2):
    # Model 2 won: mirror image of update_symbols1.
    print("Voted for Model 2")
    log_vote(model1=m1, model2=m2, question=q, output1=a1, output2=a2, outcome=m2)
    votes_ref = db.collection('votes')

    vote_doc = votes_ref.document(m2).get()
    elo2 = Rating(vote_doc.get('elo_rating'))
    if vote_doc.exists:
        votes_ref.document(m2).update({'win_count': firestore.Increment(1)})
    else:
        votes_ref.document(m2).set({'win_count': 1})

    vote_doc = votes_ref.document(m1).get()
    elo1 = Rating(vote_doc.get('elo_rating'))
    # Winner first: rate_1vs1(elo2, elo1) returns (new elo2, new elo1).
    elo2, elo1 = trueskill.rate_1vs1(elo2, elo1)
    votes_ref.document(m2).update({'elo_rating': elo2.mu})
    votes_ref.document(m1).update({'elo_rating': elo1.mu})
    if vote_doc.exists:
        votes_ref.document(m1).update({'loss_count': firestore.Increment(1)})
    else:
        votes_ref.document(m1).set({'loss_count': 1})
    # update_total_votes()
    return update_symbols(q, m1, a1, m2, a2)


def update_symbols(q, m1, a1, m2, a2):
    # Draw a new question and a new pair of models for the next round.
    random_question = random.choice(questions)
    random_model1, random_model2 = random.sample(models, 2)
    answer1, answer2 = display_answers(random_question, random_model1, random_model2, combined_df)
    m1 = gr.Markdown(f"{random_model1}", visible=False)
    a1 = gr.Markdown(answer1)
    q = gr.Markdown(f"{random_question}")
    m2 = gr.Markdown(f"{random_model2}", visible=False)
    a2 = gr.Markdown(answer2)
    return q, m1, a1, m2, a2


def update_total_votes():
    votes_ref = db.collection('votes')
    vote_doc = votes_ref.document('total').get()
    if vote_doc.exists:
        votes_ref.document('total').update({'count': firestore.Increment(1)})
    else:
        votes_ref.document('total').set({'count': 1})


def log_vote(model1, model2, question, output1, output2, outcome):
    # Reference to the Firestore collection where votes are logged.
    votes_log_ref = db.collection('votes_log')
    # Create a new document for this vote.
    vote_data = {
        'model1': model1,
        'model2': model2,
        'question': question,
        'output1': output1,
        'output2': output2,
        'outcome': outcome,
        'timestamp': firestore.SERVER_TIMESTAMP,
    }
    # Add the vote document to Firestore.
    votes_log_ref.add(vote_data)


def fetch_and_format_leaderboard():
    # Build a leaderboard DataFrame from the per-model vote documents.
    vote_counts_ref = db.collection('votes')
    docs = vote_counts_ref.stream()
    leaderboard = []
    for doc in docs:
        model_data = doc.to_dict()
        model_name = doc.id
        win_count = model_data.get('win_count', 0)
        loss_count = model_data.get('loss_count', 0)
        total_matches = win_count + loss_count
        win_rate = (win_count / total_matches) * 100 if total_matches > 0 else 0
        elo_rating = model_data.get('elo_rating', 0)
        leaderboard.append({
            "model": model_name,
            "win_rate": win_rate,
            "TrueSkill rating": elo_rating,
        })
    # Sort the leaderboard by win rate in descending order.
    leaderboard.sort(key=lambda x: x['win_rate'], reverse=True)
    leaderboard_df = pd.DataFrame(leaderboard)
    # Rank follows the sorted order, so it works for any number of models.
    leaderboard_df['Rank'] = range(1, len(leaderboard_df) + 1)
    leaderboard_df = leaderboard_df[['Rank', 'model', 'win_rate', 'TrueSkill rating']]
    # Gradio can render the DataFrame directly.
    return leaderboard_df


# questions = list_questions()
db = firestore.client()


def fetch_questions_c(collection):
    # Read every document from the given Firestore collection.
    questions_ref = db.collection(collection)
    docs = questions_ref.stream()
    questions_list = []
    for doc in docs:
        questions_list.append(doc.to_dict())
    return questions_list


# Load the pre-generated answers of each model from its own collection.
codekobzar = fetch_questions_c('codekobzar')
gpt = fetch_questions_c('gpt-4')
llama = fetch_questions_c('llama-2-70b-chat')
sherlocknorag = fetch_questions_c('sherlock-no-rag')
sherlockrag = fetch_questions_c('sherlock-rag')
ukrainenow = fetch_questions_c('ukrainenow')

df1 = pd.DataFrame(codekobzar)
df2 = pd.DataFrame(gpt)
df3 = pd.DataFrame(llama)
df4 = pd.DataFrame(sherlocknorag)
df5 = pd.DataFrame(sherlockrag)
df6 = pd.DataFrame(ukrainenow)

df1['model'] = 'codekobzar'
df2['model'] = 'gpt-4'
df3['model'] = 'llama-2-70b-chat'
df4['model'] = 'sherlock-no-rag'
df5['model'] = 'sherlock-rag'
df6['model'] = 'ukrainenow'

# One table with columns: question, answer, model.
combined_df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
combined_df.drop('input', axis=1, inplace=True)
combined_df.rename(columns={'instruction': 'question', 'output': 'answer'}, inplace=True)

models = ['codekobzar', 'gpt-4', 'llama-2-70b-chat', 'sherlock-no-rag', 'sherlock-rag', 'ukrainenow']  # list_models()

# Make sure every model has a vote document with initial counts and rating.
votes_ref = db.collection('votes')
for model in models:
    vote_doc = votes_ref.document(model).get()
    if vote_doc.exists:
        print("-------")
    else:
        # One set() call: separate calls would each overwrite the previous fields.
        votes_ref.document(model).set({'win_count': 0, 'loss_count': 0, 'elo_rating': 25})

# Placeholder state shown before the user presses START!.
random_question = 'Click any button to start!'
random_model1, random_model2 = '1', '2'
answer1, answer2 = display_answers(random_question, random_model1, random_model2, combined_df)

# The pool of questions users can be asked.
questions = []
questions_ = fetch_questions()
for question in questions_:
    questions.append(question['question_text'])

votes_ref = db.collection('votes')


def create_app():
    print('-----------------------')
    print(random_question)
    print(random_model1)
    print('-----!!!!!!!!!!!!!')
    with gr.Blocks() as app:
        q = gr.Markdown(f"### Question: {random_question}")
        with gr.Row():
            with gr.Column():
                # Model names stay hidden so votes are blind.
                m1 = gr.Markdown(f"{random_model1}", visible=False)
                a1 = gr.Markdown(answer1)
            with gr.Column():
                m2 = gr.Markdown(f"{random_model2}", visible=False)
                a2 = gr.Markdown(answer2)
        with gr.Row():
            b1 = gr.Button("Vote for Model 1", interactive=False)
            b2 = gr.Button("It’s a tie!", interactive=False)
            b3 = gr.Button("Vote for Model 2", interactive=False)
        with gr.Row():
            b4 = gr.Button("START!", interactive=True)
        # with gr.Row():
        #     b5 = gr.Button("Show Leaderboard")

        initial_leaderboard_data = fetch_and_format_leaderboard()
        # leaderboard_display = gr.Textbox(value=initial_leaderboard_data, label="Leaderboard",
        #                                  placeholder="Leaderboard will be displayed here.", lines=30, visible=True)
        leaderboard_display = gr.Dataframe(value=initial_leaderboard_data, label="Leaderboard")
        # b5.click(fn=fetch_and_format_leaderboard, inputs=[], outputs=leaderboard_display)

        b4.click(update_b, inputs=[q, m1, a1, m2, a2], outputs=[q, m1, a1, m2, a2, b1, b2, b3, b4])
        b1.click(update_symbols1, inputs=[q, m1, a1, m2, a2], outputs=[q, m1, a1, m2, a2])
        b2.click(update_symbols2, inputs=[q, m1, a1, m2, a2], outputs=[q, m1, a1, m2, a2])
        b3.click(update_symbols3, inputs=[q, m1, a1, m2, a2], outputs=[q, m1, a1, m2, a2])

        leaderboard_button = gr.Button("Refresh Leaderboard")
        leaderboard_button.click(fn=fetch_and_format_leaderboard, inputs=[], outputs=leaderboard_display)
    return app


app = create_app()
app.launch()