import gradio as gr
import pandas as pd
from glob import glob
from pathlib import Path

from tabs.dashboard import df
from tabs.faq import (
    about_olas_predict_benchmark,
    about_olas_predict,
    about_the_dataset,
    about_the_tools,
)
from tabs.howto_benchmark import how_to_run

# Feature temporarily disabled until HF support helps us resolve the Space error
# from tabs.run_benchmark import run_benchmark_main

demo = gr.Blocks()


# def run_benchmark_gradio(
#     tool_name,
#     model_name,
#     num_questions,
#     openai_api_key,
#     anthropic_api_key,
#     openrouter_api_key,
# ):
#     """Run the benchmark using the UI inputs."""
#     if tool_name is None:
#         return "Please enter the name of your tool."
#     if (
#         openai_api_key is None
#         and anthropic_api_key is None
#         and openrouter_api_key is None
#     ):
#         return "Please enter an OpenAI, Anthropic, or OpenRouter API key."

#     result = run_benchmark_main(
#         tool_name,
#         model_name,
#         num_questions,
#         openai_api_key,
#         anthropic_api_key,
#         openrouter_api_key,
#     )

#     if result == "completed":
#         # Collect the CSV files written to the results directory.
#         fns = glob("results/*.csv")
#         print(f"Number of files in results directory: {len(fns)}")

#         # Convert to Path objects so we can filter on filenames.
#         files = [Path(file) for file in fns]

#         # Split into results and summary files by filename.
#         results_files = [file for file in files if "results" in file.name]
#         summary_files = [file for file in files if "summary" in file.name]
#         print(results_files, summary_files)

#         results_df = pd.read_csv(results_files[0])
#         summary_df = pd.read_csv(summary_files[0])

#         # Round all float values to 4 decimal places.
#         results_df = results_df.round(4)
#         summary_df = summary_df.round(4)

#         return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)

#     return gr.Textbox(
#         label="Benchmark Result", value=result, interactive=False
#     ), gr.Textbox(label="Summary", value="")


with demo:
    gr.HTML("<h1>Olas Predict Benchmark</h1>")
    gr.Markdown(
        "Leaderboard showing the performance of Olas Predict tools on the "
        "Autocast dataset, and an overview of the project."
    )

    with gr.Tabs() as tabs:
        # First tab - leaderboard
        with gr.TabItem("🏅 Benchmark Leaderboard", id=0):
            gr.components.Dataframe(
                value=df,
            )

        # Second tab - about
        with gr.TabItem("ℹī¸ About"):
            with gr.Row():
                with gr.Accordion("About the Benchmark", open=False):
                    gr.Markdown(about_olas_predict_benchmark)
            with gr.Row():
                with gr.Accordion("About the Tools", open=False):
                    gr.Markdown(about_the_tools)
            with gr.Row():
                with gr.Accordion("About the Autocast Dataset", open=False):
                    gr.Markdown(about_the_dataset)
            with gr.Row():
                with gr.Accordion("About Olas", open=False):
                    gr.Markdown(about_olas_predict)

        # Third tab - how to run the benchmark
        with gr.TabItem("🚀 Contribute"):
            gr.Markdown(how_to_run)

        # Fourth tab - run the benchmark (disabled; see note above)
        # with gr.TabItem("đŸ”Ĩ Run the Benchmark"):
        #     with gr.Row():
        #         tool_name = gr.Dropdown(
        #             [
        #                 "prediction-offline",
        #                 "prediction-online",
        #                 # "prediction-online-summarized-info",
        #                 # "prediction-offline-sme",
        #                 # "prediction-online-sme",
        #                 "prediction-request-rag",
        #                 "prediction-request-reasoning",
        #                 # "prediction-url-cot-claude",
        #                 # "prediction-request-rag-cohere",
        #                 # "prediction-with-research-conservative",
        #                 # "prediction-with-research-bold",
        #             ],
        #             label="Tool Name",
        #             info="Choose the tool to run",
        #         )
        #         model_name = gr.Dropdown(
        #             [
        #                 "gpt-3.5-turbo-0125",
        #                 "gpt-4-0125-preview",
        #                 "claude-3-haiku-20240307",
        #                 "claude-3-sonnet-20240229",
        #                 "claude-3-opus-20240229",
        #                 "databricks/dbrx-instruct:nitro",
        #                 "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
        #                 # "cohere/command-r-plus",
        #             ],
        #             label="Model Name",
        #             info="Choose the model to use",
        #         )
        #     with gr.Row():
        #         openai_api_key = gr.Textbox(
        #             label="OpenAI API Key",
        #             placeholder="Enter your OpenAI API key here",
        #             type="password",
        #         )
        #         anthropic_api_key = gr.Textbox(
        #             label="Anthropic API Key",
        #             placeholder="Enter your Anthropic API key here",
        #             type="password",
        #         )
        #         openrouter_api_key = gr.Textbox(
        #             label="OpenRouter API Key",
        #             placeholder="Enter your OpenRouter API key here",
        #             type="password",
        #         )
        #     with gr.Row():
        #         num_questions = gr.Slider(
        #             minimum=1,
        #             maximum=340,
        #             value=10,
        #             label="Number of questions to run the benchmark on",
        #         )
        #     with gr.Row():
        #         run_button = gr.Button("Run Benchmark")
        #     with gr.Row():
        #         with gr.Accordion("Results", open=True):
        #             result = gr.Dataframe()
        #     with gr.Row():
        #         with gr.Accordion("Summary", open=False):
        #             summary = gr.Dataframe()

        #     run_button.click(
        #         run_benchmark_gradio,
        #         inputs=[
        #             tool_name,
        #             model_name,
        #             num_questions,
        #             openai_api_key,
        #             anthropic_api_key,
        #             openrouter_api_key,
        #         ],
        #         outputs=[result, summary],
        #     )

demo.queue(default_concurrency_limit=40).launch()