"""Gradio front-end for the Powered-by-Intel LLM Leaderboard.

The script builds a single ``gr.Blocks`` app with:

* a chat accordion that streams completions from a remote inference endpoint,
* a filterable leaderboard table loaded from a CSV snapshot,
* static informational tabs, and
* a submission form that POSTs its fields to a remote endpoint.

Configuration is read from the environment variables
``inference_endpoint_url``, ``submission_form_endpoint_url`` and
``inference_concurrency_limit``; a missing variable raises ``KeyError`` at
import time.
"""
import codecs
import importlib
import importlib.metadata
import os
import subprocess
import sys

import pandas as pd
import requests

# Work around due to HF Spaces bug: the app needs exactly this gradio release.
REQUIRED_GRADIO_VERSION = "4.16.0"


def _ensure_gradio_version(required):
    """Best-effort (re)install of gradio at ``required`` if another version is present.

    The check reads the installed distribution metadata and runs BEFORE gradio
    is imported: re-installing a package that is already imported does not
    replace the module objects held by the running process.
    pip failures are deliberately not fatal (``check=False``).
    """
    try:
        installed = importlib.metadata.version("gradio")
    except importlib.metadata.PackageNotFoundError:
        installed = None
    if installed == required:
        return
    # Argument lists + sys.executable: no shell involved, and pip is guaranteed
    # to target the interpreter that is running this script.
    pip = [sys.executable, "-m", "pip"]
    subprocess.run(pip + ["uninstall", "-y", "gradio"], check=False)
    subprocess.run(pip + ["install", f"gradio=={required}"], check=False)
    # Let the import system notice the freshly installed files.
    importlib.invalidate_caches()


_ensure_gradio_version(REQUIRED_GRADIO_VERSION)

import gradio
import gradio as gr

from info.train_a_model import (
    LLM_BENCHMARKS_TEXT)
from info.submit import (
    SUBMIT_TEXT)
from info.deployment import (
    DEPLOY_TEXT)
from info.programs import (
    PROGRAMS_TEXT)
from info.citation import (
    CITATION_TEXT)
from info.validated_chat_models import (
    VALIDATED_CHAT_MODELS)
from info.about import (
    ABOUT)
from src.processing import filter_benchmarks_table

inference_endpoint_url = os.environ['inference_endpoint_url']
submission_form_endpoint_url = os.environ['submission_form_endpoint_url']
# NOTE(review): read (and therefore required) but never used below; it looks
# like it was meant to cap chat concurrency -- confirm before wiring it in.
inference_concurrency_limit = os.environ['inference_concurrency_limit']

# Network timeouts so that a stalled endpoint cannot block a worker forever.
SUBMISSION_TIMEOUT_SECONDS = 60
# (connect timeout, read timeout between received bytes)
INFERENCE_TIMEOUT_SECONDS = (10, 300)

# Option lists shared by the leaderboard filters and the submission form.
HW_CHOICES = ["Gaudi", "Xeon", "GPU Max", "Arc GPU", "Core Ultra"]
INFRA_CHOICES = ["Intel Developer Cloud", "AWS", "Azure", "Google Cloud Platform", "Local"]
AFFILIATION_CHOICES = ["No Affiliation", "Intel Innovator", "Student Ambassador",
                       "Intel Liftoff", "Intel Engineering", "Other"]
SIZE_CHOICES = [1, 2, 3, 5, 7, 13, 35, 60, 70, 100]
PRECISION_CHOICES = ["fp32", "fp16", "bf16", "int8", "fp8", "int4"]
# NOTE(review): the filter uses "chat-models" while the submission form uses
# "chat models"; both spellings are kept because they are runtime values.
FILTER_MODEL_TYPE_CHOICES = ["pretrained", "fine-tuned", "chat-models", "merges/moerges"]
SUBMIT_MODEL_TYPE_CHOICES = ["pretrained", "fine-tuned", "chat models", "merges/moerges"]

# NOTE(review): the chat dropdown built below is not connected to the request;
# every chat call uses this fixed model.
#chat_model_selection = chat_model_dropdown.value
chat_model_selection = 'Intel/neural-chat-7b-v1-1'


def _is_missing(key, value):
    """Return True when a submission field must be treated as not filled in."""
    if value is None:
        return True
    if isinstance(value, str):
        # Whitespace-only input is as good as empty.
        return not value.strip()
    # The terms checkbox must be explicitly ticked.
    return key == "terms" and value is False


def submit_to_endpoint(model_name, revision_name, model_type, hw_type, terms, precision,
                       weight_type, training_infra, affiliation, base_model):
    """Validate the submission form and POST it as JSON to the submission endpoint.

    Every field must be non-empty and ``terms`` must be ticked; otherwise a
    failure message naming the first offending field is returned and no
    request is sent.

    Returns:
        A status message (success, HTTP failure, or request error) for display.
    """
    # Construct the data payload to send
    data = {
        "model_name": model_name,
        "revision_name": revision_name,
        "model_type": model_type,
        "hw_type": hw_type,
        "terms": terms,
        "precision": precision,
        "weight_type": weight_type,
        "training_infrastructure": training_infra,
        "affiliation": affiliation,
        "base_model": base_model
    }

    # URL of the endpoint expecting the HTTP request
    url = submission_form_endpoint_url

    for key, value in data.items():
        if _is_missing(key, value):
            return f"❌ Failed Submission: '{key}' ensure all fields are completed and that you have agreed to evaluation terms."

    try:
        response = requests.post(url, json=data, timeout=SUBMISSION_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        return f"❌Failed to submit due to an error: {str(e)}"

    if response.status_code == 200:
        return ("✅ Submission successful! Please allow for 5 - 10 days for model evaluation to be completed. We will contact you "
                "through your model's discussion forum if we encounter any issues with your submission.")
    return f"Submission failed with status code {response.status_code}"


def call_api_and_stream_response(query, chat_model):
    """
    Call the API endpoint and yield characters as they are received.

    The query and the selected model are sent as a JSON body. The response is
    read one byte at a time so output is forwarded as soon as it arrives, and
    decoded incrementally so that multi-byte UTF-8 characters split across
    reads do not raise; undecodable bytes become U+FFFD.
    """
    url = inference_endpoint_url
    params = {"query": query, "selected_model": chat_model}
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    with requests.get(url, json=params, stream=True,
                      timeout=INFERENCE_TIMEOUT_SECONDS) as r:
        for chunk in r.iter_content(chunk_size=1):
            if not chunk:
                continue
            # May be empty while the decoder waits for the rest of a character.
            yield from decoder.decode(chunk)
        # Flush whatever the decoder still holds (e.g. a truncated sequence).
        yield from decoder.decode(b"", final=True)


def get_response(query, history):
    """
    Wrapper function to call the streaming API and compile the response.

    Yields the growing answer after every received character, in the
    list-of-(user, bot)-pairs format expected by the Chatbot component.
    ``history`` is accepted for the event signature but not used.
    """
    response = ''
    label = f"🤖 Response from LLM: {chat_model_selection}"
    stream = call_api_and_stream_response(query, chat_model=chat_model_selection)
    try:
        for char in stream:
            if char == '<':  # This is stopping condition; adjust as needed.
                break
            response += char
            yield [(label, response)]  # Correct format for Gradio Chatbot
    finally:
        # Closing the generator exits its ``with`` block and releases the HTTP
        # connection even when we stop early.
        stream.close()


def user(user_message, history):
    """Clear the textbox value and append the user's message to the history."""
    return "", history + [[user_message, None]]


def clear_chat(*args):
    """Return an empty chat history."""
    return []  # Returning an empty list to signify clearing the chat, adjust as per Gradio's capabilities


# formatting model name and adding links
# NOTE(review): ``color`` is unused and ``make_clickable`` currently returns
# the plain model name -- no link markup is produced. Confirm intended markup.
color = '#2f82d4'


def make_clickable(row):
    """Return the display string for the "Model" cell of a leaderboard row."""
    return f'{row["Model"]}'


initial_df = pd.read_csv("./status/leaderboard_status_050124.csv")
initial_df["Model"] = initial_df.apply(make_clickable, axis=1)
initial_df = initial_df.sort_values(by='Average', ascending=False)


def update_df(hw_selected, platform_selected, affiliation_selected, size_selected,
              precision_selected, type_selected):
    """Return the leaderboard table filtered by the current filter selections."""
    filtered_df = filter_benchmarks_table(df=initial_df,
                                          hw_selected=hw_selected,
                                          platform_selected=platform_selected,
                                          affiliation_selected=affiliation_selected,
                                          size_selected=size_selected,
                                          precision_selected=precision_selected,
                                          type_selected=type_selected)
    return filtered_df


demo = gr.Blocks()

with demo:

    gr.HTML("\n\n🤗Powered-by-Intel LLM Leaderboard 💻\n\n")
    gr.Markdown(
        "This leaderboard is designed to evaluate, score, and rank open-source LLMs "
        "that have been pre-trained or fine-tuned on Intel Hardware 🦾. "
        "To submit your model for evaluation, follow the instructions and complete the form in the 🏎️ Submit tab. "
        "Models submitted to the leaderboard are evaluated on the Intel Developer Cloud ☁️. "
        "The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from "
        "the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)."
    )
    gr.Markdown(
        "A special shout-out to the 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) "
        "team for generously sharing their code and best practices, "
        "ensuring that AI Developers have a valuable and enjoyable tool at their disposal."
    )

    with gr.Accordion("Chat with Top Models on the Leaderboard Here 💬", open=False):

        chat_model_dropdown = gr.Dropdown(
            choices=VALIDATED_CHAT_MODELS,
            label="Select a leaderboard model to chat with. ",
            multiselect=False,
            value=VALIDATED_CHAT_MODELS[0],
            interactive=True,
        )

        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        submit = gr.Button("Submit")
        clear = gr.Button("Clear")

        submit.click(
            fn=get_response,
            inputs=[msg, chatbot],
            outputs=chatbot
        )

        clear.click(
            fn=clear_chat,
            inputs=None,
            outputs=chatbot
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 LLM Leaderboard", elem_id="llm-benchmark-table", id=0):
            with gr.Row():
                with gr.Column():
                    filter_hw = gr.CheckboxGroup(choices=list(HW_CHOICES),
                                                 label="Select Training Platform*",
                                                 elem_id="compute_platforms",
                                                 value=list(HW_CHOICES))
                    filter_platform = gr.CheckboxGroup(choices=list(INFRA_CHOICES),
                                                       label="Training Infrastructure*",
                                                       elem_id="training_infra",
                                                       value=list(INFRA_CHOICES))
                    filter_affiliation = gr.CheckboxGroup(choices=list(AFFILIATION_CHOICES),
                                                          label="Intel Program Affiliation",
                                                          elem_id="program_affiliation",
                                                          value=list(AFFILIATION_CHOICES))

                with gr.Column():
                    filter_size = gr.CheckboxGroup(choices=list(SIZE_CHOICES),
                                                   label="Model Sizes (Billion of Parameters)",
                                                   elem_id="parameter_size",
                                                   value=list(SIZE_CHOICES))
                    filter_precision = gr.CheckboxGroup(choices=list(PRECISION_CHOICES),
                                                        label="Model Precision",
                                                        elem_id="precision",
                                                        value=list(PRECISION_CHOICES))
                    filter_type = gr.CheckboxGroup(choices=list(FILTER_MODEL_TYPE_CHOICES),
                                                   label="Model Types",
                                                   elem_id="model_types",
                                                   value=list(FILTER_MODEL_TYPE_CHOICES))

            # A choice-less CheckboxGroup is used purely to show the legend text.
            inbox_text = gr.CheckboxGroup(
                label='Inference Tested Column Legend: 🟨 = Gaudi, 🟦 = Xeon, 🟥 = GPU Max, '
                      '🟠 = Core Ultra, 🟢 = Arc GPU (Please see "❓About" tab for more info)')

            # Initial table: every filter option selected.
            initial_filtered_df = update_df(list(HW_CHOICES),
                                            list(INFRA_CHOICES),
                                            list(AFFILIATION_CHOICES),
                                            list(SIZE_CHOICES),
                                            list(PRECISION_CHOICES),
                                            list(FILTER_MODEL_TYPE_CHOICES))

            gradio_df_display = gr.Dataframe(
                value=initial_filtered_df,
                headers=["Inference Tested", "Model", "Average", "ARC", "HellaSwag", "MMLU",
                         "TruthfulQA", "Winogrande", "Training Hardware", "Model Type", "Precision",
                         "Size", "Infrastructure", "Affiliation"],
                datatype=["html", "html", "str", "str", "str", "str", "str", "str", "str", "str",
                          "str", "str", "str", "str"])

            # Any filter change recomputes the table from all six filters.
            filter_components = [filter_hw, filter_platform, filter_affiliation,
                                 filter_size, filter_precision, filter_type]
            for filter_component in filter_components:
                filter_component.change(fn=update_df,
                                        inputs=filter_components,
                                        outputs=[gradio_df_display])

        with gr.TabItem("🧰 Train a Model", elem_id="getting-started", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
        with gr.TabItem("🚀 Deployment Tips", elem_id="deployment-tips", id=2):
            gr.Markdown(DEPLOY_TEXT, elem_classes="markdown-text")
        with gr.TabItem("👩‍💻 Developer Programs", elem_id="hardward-program", id=3):
            gr.Markdown(PROGRAMS_TEXT, elem_classes="markdown-text")
        with gr.TabItem("❓ About ", elem_id="about", id=5):
            gr.Markdown(ABOUT, elem_classes="markdown-text")
        with gr.TabItem("🏎️ Submit", elem_id="submit", id=4):
            gr.Markdown(SUBMIT_TEXT, elem_classes="markdown-text")
            with gr.Row():
                gr.Markdown("# Submit Model for Evaluation 🏎️", elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name",
                        info=" Name of Model in the Hub. \nFor example: 'Intel/neural-chat-7b-v1-1'",)
                    revision_name_textbox = gr.Textbox(label="Revision commit (Branch)", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=list(SUBMIT_MODEL_TYPE_CHOICES),
                        label="Model type",
                        multiselect=False,
                        value="pretrained",
                        interactive=True,
                    )

                    hw_type = gr.Dropdown(
                        choices=list(HW_CHOICES),
                        label="Training Hardware",
                        multiselect=False,
                        value="Gaudi",
                        interactive=True,
                    )
                    terms = gr.Checkbox(
                        label="Check if you agree to having your model evaluated and published to the leaderboard by our team.",
                        value=False,
                        interactive=True,
                    )
                    submit_button = gr.Button("🤗 Submit Eval 💻")
                    submission_result = gr.Markdown()

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=list(PRECISION_CHOICES),
                        label="Precision",
                        multiselect=False,
                        value="fp16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Adapter", "Delta"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                        info=" Select the appropriate weights. If you have fine-tuned or adapted a model with PEFT or Delta-Tuning you likely have LoRA Adapters or Delta Weights.",
                    )
                    training_infra = gr.Dropdown(
                        choices=list(INFRA_CHOICES),
                        label="Training Infrastructure",
                        multiselect=False,
                        value="Intel Developer Cloud",
                        interactive=True,
                        info=" Select the infrastructure that the model was developed on. \nLocal is the ideal choice for Core Ultra, ARC GPUs, and local data center infrastructure.",
                    )
                    affiliation = gr.Dropdown(
                        choices=list(AFFILIATION_CHOICES),
                        label="Affiliation with Intel",
                        multiselect=False,
                        value="No Affiliation",
                        interactive=True,
                        info=' Select "No Affiliation" if not part of any Intel programs.',
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

                submit_button.click(
                    fn=submit_to_endpoint,
                    inputs=[model_name_textbox, revision_name_textbox, model_type, hw_type, terms,
                            precision, weight_type, training_infra, affiliation, base_model_name_textbox],
                    outputs=submission_result)

    with gr.Accordion("📙 Citation", open=False):
        citation = gr.Textbox(value=CITATION_TEXT,
                              lines=6,
                              label="Use the following to cite this content")

    gr.Markdown(
        "\n\nIntel, the Intel logo and Gaudi are trademarks of Intel Corporation or its subsidiaries. "
        "*Other names and brands may be claimed as the property of others.\n\n"
    )

demo.queue()
demo.launch(share=False)