# arena/app.py
"""
It provides a platform for comparing the responses of two LLMs.
"""

import enum
from uuid import uuid4

from firebase_admin import firestore
import gradio as gr
import lingua

from db import db
from leaderboard import build_leaderboard
from leaderboard import SUPPORTED_LANGUAGES
from model import check_models
from model import supported_models
from rate_limit import set_token
import response
from response import get_responses
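
# Language detector used to tag the language of each summarization response
# before a vote is stored.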
detector = lingua.LanguageDetectorBuilder.from_all_languages().build()
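

# Human-readable labels for the vote buttons; the lowercased enum name
# (model_a, model_b, tie) is what gets stored as the winner.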
class VoteOptions(enum.Enum):
  MODEL_A = "Model A is better"
  MODEL_B = "Model B is better"
  TIE = "Tie"
def vote(vote_button, response_a, response_b, model_a_name, model_b_name,
         prompt, instruction, category, source_lang, target_lang):
  doc_id = uuid4().hex
  winner = VoteOptions(vote_button).name.lower()

  deactivated_buttons = [gr.Button(interactive=False) for _ in range(3)]
  outputs = deactivated_buttons + [gr.Row(visible=True)]

  doc = {
      "id": doc_id,
      "prompt": prompt,
      "instruction": instruction,
      "model_a": model_a_name,
      "model_b": model_b_name,
      "model_a_response": response_a,
      "model_b_response": response_b,
      "winner": winner,
      "timestamp": firestore.SERVER_TIMESTAMP
  }

  if category == response.Category.SUMMARIZE.value:
    language_a = detector.detect_language_of(response_a)
    language_b = detector.detect_language_of(response_b)

    # TODO(#37): Move DB operations to db.py.
    doc_ref = db.collection("arena-summarizations").document(doc_id)
    doc["model_a_response_language"] = language_a.name.lower()
    doc["model_b_response_language"] = language_b.name.lower()

    doc_ref.set(doc)
    return outputs

  if category == response.Category.TRANSLATE.value:
    if not source_lang or not target_lang:
      raise gr.Error("Please select source and target languages.")

    doc_ref = db.collection("arena-translations").document(doc_id)
    doc["source_language"] = source_lang.lower()
    doc["target_language"] = target_lang.lower()

    doc_ref.set(doc)
    return outputs

  raise gr.Error("Please select a response type.")


# Removes the persistent orange border from the leaderboard, which
# appears due to the 'generating' class when using the 'every' parameter.
css = """
.leaderboard .generating {
  border: none;
}
"""
with gr.Blocks(title="Yanolja Arena", css=css) as app:
  token = gr.Textbox(visible=False)
  set_token(app, token)

  with gr.Row():
    gr.HTML("""
      <h1 style="text-align: center; font-size: 28px; margin-bottom: 16px">Yanolja Arena</h1>
      <p style="text-align: center; font-size: 16px">Yanolja Arena helps find the best LLMs for summarizing and translating text. We compare two random models at a time and use an ELO rating system to score them.</p>
      <p style="text-align: center; font-size: 16px">This is an open-source project. Check it out on <a href="https://github.com/yanolja/arena">GitHub</a>.</p>
    """)

  with gr.Accordion("How to Use", open=False):
    gr.Markdown("""
      1. **For Summaries:**
         - Enter the text you want summarized into the prompt box.
      2. **For Translations:**
         - Choose the language you're translating from and to.
         - Enter the text you want translated into the prompt box.
      3. **Voting:**
         - After you see both results, pick which one you think is better.
      """)

  with gr.Row():
    category_radio = gr.Radio(
        choices=[category.value for category in response.Category],
        value=response.Category.SUMMARIZE.value,
        label="Category",
        info="The chosen category determines the instruction sent to the LLMs.")

    source_language = gr.Dropdown(
        choices=SUPPORTED_LANGUAGES,
        value=lingua.Language.ENGLISH.name.capitalize(),
        label="Source language",
        info="Choose the source language for translation.",
        interactive=True,
        visible=False)
    target_language = gr.Dropdown(
        choices=SUPPORTED_LANGUAGES,
        value=lingua.Language.KOREAN.name.capitalize(),
        label="Target language",
        info="Choose the target language for translation.",
        interactive=True,
        visible=False)
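
  # The language dropdowns only apply to translation, so they are shown or
  # hidden whenever the selected category changes.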
  def update_language_visibility(category):
    visible = category == response.Category.TRANSLATE.value
    return {
        source_language: gr.Dropdown(visible=visible),
        target_language: gr.Dropdown(visible=visible)
    }

  category_radio.change(update_language_visibility, category_radio,
                        [source_language, target_language])
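
  # Placeholder states for the model names and responses; indices 0 and 1 are
  # replaced by the Textbox components created in the layout below.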
  model_names = [gr.State(None), gr.State(None)]
  response_boxes = [gr.State(None), gr.State(None)]

  prompt_textarea = gr.TextArea(label="Prompt", lines=4)
  submit = gr.Button()

  with gr.Group():
    with gr.Row():
      response_boxes[0] = gr.Textbox(label="Model A", interactive=False)
      response_boxes[1] = gr.Textbox(label="Model B", interactive=False)

    with gr.Row(visible=False) as model_name_row:
      model_names[0] = gr.Textbox(show_label=False)
      model_names[1] = gr.Textbox(show_label=False)

  with gr.Row(visible=False) as vote_row:
    option_a = gr.Button(VoteOptions.MODEL_A.value)
    option_b = gr.Button(VoteOptions.MODEL_B.value)
    tie = gr.Button(VoteOptions.TIE.value)
instruction_state = gr.State("")
# The following elements need to be reset when the user changes
# the category, source language, or target language.
ui_elements = [
response_boxes[0], response_boxes[1], model_names[0], model_names[1],
instruction_state, model_name_row, vote_row
]
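
  # Returns values in the same order as ui_elements: four cleared textboxes,
  # a cleared instruction state, and the two rows hidden again.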
  def reset_ui():
    return [gr.Textbox(value="") for _ in range(4)
           ] + [gr.State(""),
                gr.Row(visible=False),
                gr.Row(visible=False)]

  category_radio.change(fn=reset_ui, outputs=ui_elements)
  source_language.change(fn=reset_ui, outputs=ui_elements)
  target_language.change(fn=reset_ui, outputs=ui_elements)
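
  # Submitting a prompt locks the inputs, hides the previous results, and then
  # requests responses from two randomly selected models.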
  submit_event = submit.click(
      fn=lambda: [
          gr.Radio(interactive=False),
          gr.Dropdown(interactive=False),
          gr.Dropdown(interactive=False),
          gr.Button(interactive=False),
          gr.Row(visible=False),
          gr.Row(visible=False),
      ] + [gr.Button(interactive=True) for _ in range(3)],
      outputs=[
          category_radio, source_language, target_language, submit, vote_row,
          model_name_row, option_a, option_b, tie
      ]).then(fn=get_responses,
              inputs=[
                  prompt_textarea, category_radio, source_language,
                  target_language, token
              ],
              outputs=response_boxes + model_names + [instruction_state])
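
  # Once the responses arrive, show the vote buttons and unlock the inputs.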
  submit_event.success(fn=lambda: gr.Row(visible=True), outputs=vote_row)
  submit_event.then(
      fn=lambda: [
          gr.Radio(interactive=True),
          gr.Dropdown(interactive=True),
          gr.Dropdown(interactive=True),
          gr.Button(interactive=True)
      ],
      outputs=[category_radio, source_language, target_language, submit])
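
  # Each vote button records the vote, reveals the model names, and then
  # disables all three buttons so only one vote can be cast per comparison.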
  def deactivate_after_voting(option_button: gr.Button):
    option_button.click(
        fn=vote,
        inputs=[option_button] + response_boxes + model_names + [
            prompt_textarea, instruction_state, category_radio, source_language,
            target_language
        ],
        outputs=[option_a, option_b, tie, model_name_row]).then(
            fn=lambda: [gr.Button(interactive=False) for _ in range(3)],
            outputs=[option_a, option_b, tie])

  for option in [option_a, option_b, tie]:
    deactivate_after_voting(option)
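
  # Leaderboard of model ratings; it refreshes periodically via the 'every'
  # parameter noted in the CSS comment above.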
  build_leaderboard()

if __name__ == "__main__":
  check_models(supported_models)

  # We need to enable queue to use generators.
  app.queue(api_open=False)
  app.launch(debug=True, show_api=False)