# eduardo-alvarez's picture
# update scores
# 367c1b1
import pandas as pd
import requests
import os
import gradio
# Workaround for a Hugging Face Spaces bug: when the installed gradio is not
# the release this app was written against, replace it with the pinned one.
if gradio.__version__ != '4.16.0':
    import subprocess
    import sys

    # Invoke pip as a module of the interpreter that is running this script,
    # so the packages are changed in the environment we actually import from;
    # a bare "pip" found on PATH may belong to a different Python.
    # check=False keeps the original best-effort behaviour: a failing pip
    # command does not abort start-up.
    subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "gradio"], check=False)
    subprocess.run([sys.executable, "-m", "pip", "install", "gradio==4.16.0"], check=False)
    # NOTE(review): gradio has already been imported once at this point, so the
    # import below may still resolve to the previously loaded module within
    # this process — confirm the pin takes effect without a restart.
import gradio as gr
from info.train_a_model import (
LLM_BENCHMARKS_TEXT)
from info.submit import (
SUBMIT_TEXT)
from info.deployment import (
DEPLOY_TEXT)
from info.programs import (
PROGRAMS_TEXT)
from info.citation import(
CITATION_TEXT)
from info.validated_chat_models import(
VALIDATED_CHAT_MODELS)
from info.about import(
ABOUT)
from src.processing import filter_benchmarks_table
# Deployment settings come from the environment. Each lookup uses
# os.environ[...] directly, so a missing variable raises KeyError at start-up.
(
    inference_endpoint_url,
    submission_form_endpoint_url,
    inference_concurrency_limit,
) = (
    os.environ[setting_name]
    for setting_name in (
        'inference_endpoint_url',
        'submission_form_endpoint_url',
        'inference_concurrency_limit',
    )
)
# Root Gradio container for the app; the UI components created below are
# declared while this Blocks context is open (original indentation is not
# visible in this view).
demo = gr.Blocks()
with demo:
# Page title rendered as raw HTML.
# NOTE(review): several emoji in the string literals of this file look
# mis-encoded — confirm the file's encoding before editing them.
gr.HTML("""<h1 align="center" id="space-title">πŸ€—Powered-by-Intel LLM Leaderboard πŸ’»</h1>""")
# Introductory paragraph describing the leaderboard and how to submit.
gr.Markdown("""This leaderboard is designed to evaluate, score, and rank open-source LLMs
that have been pre-trained or fine-tuned on Intel Hardware 🦾. To submit your model for evaluation,
follow the instructions and complete the form in the 🏎️ Submit tab. Models submitted to the leaderboard are evaluated
on the Intel Developer Cloud ☁️. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from
the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness).""")
# Acknowledgement paragraph linking to the Open LLM Leaderboard Space.
gr.Markdown("""A special shout-out to the πŸ€— [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
team for generously sharing their code and best
practices, ensuring that AI Developers have a valuable and enjoyable tool at their disposal.""")
def submit_to_endpoint(model_name, revision_name, model_type, hw_type, terms, precision, weight_type, training_infra, affiliation, base_model):
    """Validate the submission form and POST it to the submission endpoint.

    Every field, including ``base_model``, must be filled in: ``None``, an
    empty string or a whitespace-only string counts as missing. ``terms``
    must not be ``False``. Validation happens before any network access, and
    the first offending field is named in the returned message.

    Args:
        model_name, revision_name, model_type, hw_type, precision,
        weight_type, training_infra, affiliation, base_model: values of the
            corresponding form widgets.
        terms: whether the submitter agreed to the evaluation terms.

    Returns:
        A human-readable status string: a validation failure, a success
        message (HTTP 200), a failure message carrying the HTTP status code,
        or a failure message carrying the text of the raised exception.
    """
    # Upper bound, in seconds, on how long the UI waits for the endpoint;
    # without a timeout requests can block indefinitely.
    request_timeout_s = 30

    # Payload keys are the field names the endpoint receives.
    data = {
        "model_name": model_name,
        "revision_name": revision_name,
        "model_type": model_type,
        "hw_type": hw_type,
        "terms": terms,
        "precision": precision,
        "weight_type": weight_type,
        "training_infrastructure": training_infra,
        "affiliation": affiliation,
        "base_model": base_model,
    }

    for key, value in data.items():
        blank = value is None or (isinstance(value, str) and not value.strip())
        declined = key == "terms" and value is False
        if blank or declined:
            return f"❌ Failed Submission: '{key}' ensure all fields are completed and that you have agreed to evaluation terms."

    # Only the network call sits inside the try block. The broad catch is
    # kept on purpose: this is a UI callback, and any failure should be
    # reported as a message rather than raised.
    try:
        response = requests.post(submission_form_endpoint_url, json=data, timeout=request_timeout_s)
    except Exception as e:
        return f"❌Failed to submit due to an error: {str(e)}"

    if response.status_code == 200:
        return (
            "✅ Submission successful! Please allow for 5 - 10 days for model evaluation to be completed. We will contact you "
            "through your model's discussion forum if we encounter any issues with your submission."
        )
    return f"Submission failed with status code {response.status_code}"
# Collapsible chat panel; open=False means it starts collapsed.
with gr.Accordion("Chat with Top Models on the Leaderboard Here πŸ’¬", open=False):
# Single-choice dropdown listing VALIDATED_CHAT_MODELS, preselecting the first entry.
chat_model_dropdown = gr.Dropdown(
choices=VALIDATED_CHAT_MODELS,
label="Select a leaderboard model to chat with. ",
multiselect=False,
value=VALIDATED_CHAT_MODELS[0],
interactive=True,
)
#chat_model_selection = chat_model_dropdown.value
# NOTE(review): the model name is fixed here and the dropdown above is not
# passed to any event handler in this file, so changing the dropdown does not
# appear to change which model is queried — confirm this is intended.
chat_model_selection = 'Intel/neural-chat-7b-v1-1'
def call_api_and_stream_response(query, chat_model):
    """
    Call the inference endpoint and yield the reply one character at a time.

    The response body is read one byte at a time and passed through an
    incremental UTF-8 decoder, so a character encoded as several bytes is
    yielded only once all of its bytes have arrived. Decoding each byte on
    its own would raise UnicodeDecodeError on the first non-ASCII character.
    Malformed byte sequences are replaced with U+FFFD instead of raising.

    Args:
        query: the user's prompt text.
        chat_model: name of the model, sent as "selected_model".

    Yields:
        Single-character strings, in the order they are decoded.
    """
    import codecs  # local import: only this function needs it

    url = inference_endpoint_url
    params = {"query": query, "selected_model": chat_model}
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    # The payload is sent as a JSON body on the GET request, as in the
    # original code; the endpoint's contract is not visible here.
    with requests.get(url, json=params, stream=True) as r:
        for chunk in r.iter_content(chunk_size=1):
            # decode() returns "" while a multi-byte character is incomplete;
            # iterating the result keeps the one-character-per-yield contract.
            yield from decoder.decode(chunk)
        # Flush whatever is buffered if the stream ended mid-character.
        yield from decoder.decode(b"", final=True)
def get_response(query, history):
    """
    Stream the model's reply as successive Chatbot values.

    Characters produced by call_api_and_stream_response are appended to a
    running reply. After each character the complete chat content is yielded
    as a one-element list holding a (label, reply) pair, where the label
    names the model in chat_model_selection. A '<' character ends the stream
    without being added to the reply. ``history`` is accepted but not used.
    """
    reply_so_far = ''
    character_stream = call_api_and_stream_response(query, chat_model=chat_model_selection)
    for piece in character_stream:
        if piece != '<':
            reply_so_far = reply_so_far + piece
            yield [(f"πŸ€– Response from LLM: {chat_model_selection}", reply_so_far)]
        else:
            # '<' is the stop marker; nothing more is emitted after it.
            return
# Chat widgets: conversation display, message input box, and the two action
# buttons wired up further below.
chatbot = gr.Chatbot()
msg = gr.Textbox()
submit = gr.Button("Submit")
clear = gr.Button("Clear")
def user(user_message, history):
    """Return an empty string and a new history ending with the user's turn.

    The new turn is ``[user_message, None]``. ``history`` itself is not
    modified; a new list is built by concatenation.
    """
    pending_turn = [user_message, None]
    extended_history = history + [pending_turn]
    return "", extended_history
def clear_chat(*args):
    """Return an empty list, used as the cleared chat value; arguments are ignored."""
    cleared_history = []
    return cleared_history
# "Submit": run get_response with the textbox text and the current chatbot
# value, and write what it yields into the chatbot.
submit.click(
fn=get_response,
inputs=[msg, chatbot],
outputs=chatbot
)
# "Clear": call clear_chat with no inputs and write its result (an empty
# list) into the chatbot.
clear.click(
fn=clear_chat,
inputs=None,
outputs=chatbot
)
# Tab container for the main sections of the app.
with gr.Tabs(elem_classes="tab-buttons") as tabs:
# Leaderboard tab (id=0): filter checkboxes in two columns, then the results table.
with gr.TabItem("πŸ† LLM Leaderboard", elem_id="llm-benchmark-table", id=0):
with gr.Row():
with gr.Column():
# Each filter group below starts with every one of its choices selected
# (its `value` list equals its `choices` list).
filter_hw = gr.CheckboxGroup(choices=["Gaudi","Xeon","GPU Max","Arc GPU","Core Ultra"],
label="Select Training Platform*",
elem_id="compute_platforms",
value=["Gaudi","Xeon","GPU Max","Arc GPU","Core Ultra"])
filter_platform = gr.CheckboxGroup(choices=["Intel Developer Cloud","AWS","Azure","Google Cloud Platform","Local"],
label="Training Infrastructure*",
elem_id="training_infra",
value=["Intel Developer Cloud","AWS","Azure","Google Cloud Platform","Local"])
filter_affiliation = gr.CheckboxGroup(choices=["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"],
label="Intel Program Affiliation",
elem_id="program_affiliation",
value=["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"])
with gr.Column():
# Size choices are integers, labelled as billions of parameters.
filter_size = gr.CheckboxGroup(choices=[1,2,3,5,7,13,35,60,70,100],
label="Model Sizes (Billion of Parameters)",
elem_id="parameter_size",
value=[1,2,3,5,7,13,35,60,70,100])
filter_precision = gr.CheckboxGroup(choices=["fp32","fp16","bf16","int8","fp8", "int4"],
label="Model Precision",
elem_id="precision",
value=["fp32","fp16","bf16","int8","fp8", "int4"])
filter_type = gr.CheckboxGroup(choices=["pretrained","fine-tuned","chat-models","merges/moerges"],
label="Model Types",
elem_id="model_types",
value=["pretrained","fine-tuned","chat-models","merges/moerges"])
# CheckboxGroup created with a label only (no choices); the label carries the
# legend text for the "Inference Tested" column.
inbox_text = gr.CheckboxGroup(label = """Inference Tested Column Legend: 🟨 = Gaudi, 🟦 = Xeon, πŸŸ₯ = GPU Max, 🟠 = Core Ultra, 🟒 = Arc GPU (Please see "❓About" tab for more info)""")
# Link colour used for model names shown in the leaderboard table.
color = '#2f82d4'
def make_clickable(row):
    """Return an HTML anchor for ``row["Model"]`` pointing at huggingface.co.

    The link opens in a new tab and is underlined in the module-level
    ``color``; the model name is used both in the URL and as the link text.
    """
    model_id = row["Model"]
    link_target = f'https://huggingface.co/{model_id}'
    link_style = f'color: {color}; text-decoration: underline;'
    return f'<a href="{link_target}" target="_blank" style="{link_style}">{model_id}</a>'
# Load the leaderboard snapshot from disk.
initial_df = pd.read_csv("./status/leaderboard_status_050124.csv")
# Replace each plain model name with its HTML link, computed row by row.
model_links = initial_df.apply(make_clickable, axis=1)
initial_df["Model"] = model_links
# Order rows by the "Average" column, largest value first.
initial_df = initial_df.sort_values(by='Average', ascending=False)
def update_df(hw_selected, platform_selected, affiliation_selected, size_selected, precision_selected, type_selected):
    """Return the leaderboard rows matching the given filter selections.

    Passes the module-level ``initial_df`` and the six selections, each under
    the keyword of the same name, to ``filter_benchmarks_table`` and returns
    whatever that function returns.
    """
    selections = {
        "hw_selected": hw_selected,
        "platform_selected": platform_selected,
        "affiliation_selected": affiliation_selected,
        "size_selected": size_selected,
        "precision_selected": precision_selected,
        "type_selected": type_selected,
    }
    return filter_benchmarks_table(df=initial_df, **selections)
# Table contents shown on first load: update_df called with the same lists
# the filter widgets use as their default values (every option selected).
initial_filtered_df = update_df(["Gaudi","Xeon","GPU Max","Arc GPU","Core Ultra"],
["Intel Developer Cloud","AWS","Azure","Google Cloud Platform","Local"],
["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"],
[1,2,3,5,7,13,35,60,70,100],
["fp32","fp16","bf16","int8","fp8", "int4"],
["pretrained","fine-tuned","chat-models","merges/moerges"])
# Results table with 14 headers and 14 datatypes: the first two columns
# ("Inference Tested", "Model") are declared as "html", the rest as "str".
gradio_df_display = gr.Dataframe(value=initial_filtered_df, headers=["Inference Tested","Model","Average","ARC","HellaSwag","MMLU",
"TruthfulQA","Winogrande","Training Hardware","Model Type","Precision",
"Size","Infrastructure","Affiliation"],
datatype=["html","html","str","str","str","str","str","str","str","str","str","str","str","str"])
# Every filter widget triggers the same refresh: update_df is called with the
# current value of all six filters and its result replaces the table.
# Handlers are registered in the same order as before.
filter_components = (filter_hw, filter_platform, filter_affiliation, filter_size, filter_precision, filter_type)
for filter_component in filter_components:
    filter_component.change(fn=update_df,
                            inputs=list(filter_components),
                            outputs=[gradio_df_display])
# Informational tabs: each renders one imported Markdown constant.
with gr.TabItem("🧰 Train a Model", elem_id="getting-started", id=1):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸš€ Deployment Tips", elem_id="deployment-tips", id=2):
gr.Markdown(DEPLOY_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸ‘©β€πŸ’» Developer Programs", elem_id="hardward-program", id=3):
gr.Markdown(PROGRAMS_TEXT, elem_classes="markdown-text")
# The About tab is declared with id=5, ahead of the Submit tab (id=4) that follows it.
with gr.TabItem("❓ About ", elem_id="about", id=5):
gr.Markdown(ABOUT, elem_classes="markdown-text")
# Submit tab (id=4): instructions, a heading, then the submission form.
with gr.TabItem("🏎️ Submit", elem_id="submit", id=4):
gr.Markdown(SUBMIT_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown("# Submit Model for Evaluation 🏎️", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
# Free-text fields. "main" on the revision box is placeholder text only,
# not a default value.
model_name_textbox = gr.Textbox(label="Model name",
info = """ Name of Model in the Hub. For example: 'Intel/neural-chat-7b-v1-1'""",)
revision_name_textbox = gr.Textbox(label="Revision commit (Branch)", placeholder="main")
# NOTE(review): this list uses "chat models" while the leaderboard filter uses
# "chat-models" — confirm which spelling the backend and the CSV expect.
model_type = gr.Dropdown(
choices=["pretrained","fine-tuned","chat models","merges/moerges"],
label="Model type",
multiselect=False,
value="pretrained",
interactive=True,
)
hw_type = gr.Dropdown(
choices=["Gaudi","Xeon","GPU Max","Arc GPU","Core Ultra"],
label="Training Hardware",
multiselect=False,
value="Gaudi",
interactive=True,
)
# Consent checkbox, unchecked by default; submit_to_endpoint rejects the
# form while it is False.
terms = gr.Checkbox(
label="Check if you agree to having your model evaluated and published to the leaderboard by our team.",
value=False,
interactive=True,
)
# Submit button and the Markdown area that shows the returned status message.
submit_button = gr.Button("πŸ€— Submit Eval πŸ’»")
submission_result = gr.Markdown()
# Second form column: single-choice dropdowns, each with a preselected
# value, followed by the base-model text box.
with gr.Column():
precision = gr.Dropdown(
choices=["fp32","fp16","bf16","int8","fp8", "int4"],
label="Precision",
multiselect=False,
value="fp16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=["Original", "Adapter", "Delta"],
label="Weights type",
multiselect=False,
value="Original",
interactive=True,
info = """ Select the appropriate weights. If you have fine-tuned or adapted a model with PEFT or Delta-Tuning you likely have
LoRA Adapters or Delta Weights.""",
)
training_infra = gr.Dropdown(
choices=["Intel Developer Cloud","AWS","Azure","Google Cloud Platform","Local"],
label="Training Infrastructure",
multiselect=False,
value="Intel Developer Cloud",
interactive=True,
info = """ Select the infrastructure that the model was developed on.
Local is the ideal choice for Core Ultra, ARC GPUs, and local data center infrastructure.""",
)
affiliation = gr.Dropdown(
choices=["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"],
label="Affiliation with Intel",
multiselect=False,
value="No Affiliation",
interactive=True,
info = """ Select "No Affiliation" if not part of any Intel programs.""",
)
# NOTE(review): the label says this applies to delta or adapter weights, but
# submit_to_endpoint rejects an empty value for every field, this one
# included — confirm whether it should be optional for "Original" weights.
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
# Clicking the button calls submit_to_endpoint. The inputs are listed in the
# same order as that function's parameters, and the returned message is
# shown in submission_result.
submit_button.click(
fn=submit_to_endpoint,
inputs=[model_name_textbox, revision_name_textbox, model_type, hw_type, terms, precision, weight_type, training_infra, affiliation, base_model_name_textbox],
outputs=submission_result)
# Collapsed accordion holding a six-line text box pre-filled with CITATION_TEXT.
with gr.Accordion("πŸ“™ Citation", open=False):
citation =gr.Textbox(value = CITATION_TEXT,
lines=6,
label="Use the following to cite this content")
# Trademark notice, wrapped in a flex container that centres it horizontally.
gr.Markdown("""<div style="display: flex; justify-content: center;"> <p> Intel, the Intel logo and Gaudi are trademarks of Intel Corporation or its subsidiaries.
*Other names and brands may be claimed as the property of others.
</p> </div>""")
# Enable request queuing with default settings, then start the app without a
# public share link.
demo.queue().launch(share=False)