Spaces:

valory
/

olas-prediction-leaderboard

Runtime error

App Files Files Community

cyberosa committed on Jul 1

Commit

ac62b55

•

1 Parent(s): af4f5ae

Prints and formatting

Browse files

Files changed (3) hide show

app.py +89 -39
tabs/dashboard.py +3 -1
tabs/run_benchmark.py +15 -5

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from tabs.faq import (
     about_olas_predict_benchmark,
     about_olas_predict,
     about_the_dataset,
-    about_the_tools
 )
 from tabs.howto_benchmark import how_to_run
 from tabs.run_benchmark import run_benchmark_main
@@ -17,17 +17,36 @@ from tabs.run_benchmark import run_benchmark_main
 demo = gr.Blocks()
-def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
     """Run the benchmark using inputs."""
     if tool_name is None:
         return "Please enter the name of your tool."
-    if openai_api_key is None and anthropic_api_key is None and openrouter_api_key is None:
         return "Please enter either OpenAI or Anthropic or OpenRouter API key."
-    result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key)
-    if result == 'completed':
         # get the results file in the results directory
-        fns = glob('results/*.csv')
         print(f"Number of files in results directory: {len(fns)}")
@@ -35,10 +54,10 @@ def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, a
         files = [Path(file) for file in fns]
         # get results and summary files
-        results_files = [file for file in files if 'results' in file.name]
         # the other file is the summary file
-        summary_files = [file for file in files if 'summary' in file.name]
         print(results_files, summary_files)
@@ -51,13 +70,17 @@ def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, a
         summary_df = summary_df.round(4)
         return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
-    return gr.Textbox(label="Benchmark Result", value=result, interactive=False), gr.Textbox(label="Summary", value="")
 with demo:
     gr.HTML("<h1>Olas Predict Benchmark</hjson>")
-    gr.Markdown("Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project.")
     with gr.Tabs() as tabs:
         # first tab - leaderboard
@@ -82,7 +105,6 @@ with demo:
                 with gr.Accordion("About Olas", open=False):
                     gr.Markdown(about_olas_predict)
         # third tab - how to run the benchmark
         with gr.TabItem("🚀 Contribute"):
             gr.Markdown(how_to_run)
@@ -97,34 +119,53 @@ with demo:
                         # "prediction-online-summarized-info",
                         # "prediction-offline-sme",
                         # "prediction-online-sme",
-                        'prediction-request-rag',
-                        'prediction-request-reasoning',
                         # "prediction-url-cot-claude",
                         # "prediction-request-rag-cohere",
                         # "prediction-with-research-conservative",
                         # "prediction-with-research-bold",
-                    ], label="Tool Name", info="Choose the tool to run")
-                model_name = gr.Dropdown([
-                    "gpt-3.5-turbo-0125",
-                    "gpt-4-0125-preview",
-                    "claude-3-haiku-20240307",
-                    "claude-3-sonnet-20240229",
-                    "claude-3-opus-20240229",
-                    "databricks/dbrx-instruct:nitro",
-                    "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
-                    # "cohere/command-r-plus",
-                ], label="Model Name", info="Choose the model to use")
             with gr.Row():
-                openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
-                anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
-                openrouter_api_key = gr.Textbox(label="OpenRouter API Key", placeholder="Enter your OpenRouter API key here", type="password")
             with gr.Row():
                 num_questions = gr.Slider(
-                                    minimum=1,
-                                    maximum=340,
-                                    value=10,
-                                    label="Number of questions to run the benchmark on",
-                                )
             with gr.Row():
                 run_button = gr.Button("Run Benchmark")
             with gr.Row():
@@ -133,10 +174,19 @@ with demo:
             with gr.Row():
                 with gr.Accordion("Summary", open=False):
                     summary = gr.Dataframe()
-            run_button.click(run_benchmark_gradio,
-                            inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key],
-                            outputs=[result, summary])
-demo.queue(default_concurrency_limit=40).launch()

     about_olas_predict_benchmark,
     about_olas_predict,
     about_the_dataset,
+    about_the_tools,
 )
 from tabs.howto_benchmark import how_to_run
 from tabs.run_benchmark import run_benchmark_main
 demo = gr.Blocks()
+def run_benchmark_gradio(
+    tool_name,
+    model_name,
+    num_questions,
+    openai_api_key,
+    anthropic_api_key,
+    openrouter_api_key,
+):
     """Run the benchmark using inputs."""
     if tool_name is None:
         return "Please enter the name of your tool."
+    if (
+        openai_api_key is None
+        and anthropic_api_key is None
+        and openrouter_api_key is None
+    ):
         return "Please enter either OpenAI or Anthropic or OpenRouter API key."
+    result = run_benchmark_main(
+        tool_name,
+        model_name,
+        num_questions,
+        openai_api_key,
+        anthropic_api_key,
+        openrouter_api_key,
+    )
+    if result == "completed":
         # get the results file in the results directory
+        fns = glob("results/*.csv")
         print(f"Number of files in results directory: {len(fns)}")
         files = [Path(file) for file in fns]
         # get results and summary files
+        results_files = [file for file in files if "results" in file.name]
         # the other file is the summary file
+        summary_files = [file for file in files if "summary" in file.name]
         print(results_files, summary_files)
         summary_df = summary_df.round(4)
         return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
+    return gr.Textbox(
+        label="Benchmark Result", value=result, interactive=False
+    ), gr.Textbox(label="Summary", value="")
 with demo:
     gr.HTML("<h1>Olas Predict Benchmark</hjson>")
+    gr.Markdown(
+        "Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project."
+    )
     with gr.Tabs() as tabs:
         # first tab - leaderboard
                 with gr.Accordion("About Olas", open=False):
                     gr.Markdown(about_olas_predict)
         # third tab - how to run the benchmark
         with gr.TabItem("🚀 Contribute"):
             gr.Markdown(how_to_run)
                         # "prediction-online-summarized-info",
                         # "prediction-offline-sme",
                         # "prediction-online-sme",
+                        "prediction-request-rag",
+                        "prediction-request-reasoning",
                         # "prediction-url-cot-claude",
                         # "prediction-request-rag-cohere",
                         # "prediction-with-research-conservative",
                         # "prediction-with-research-bold",
+                    ],
+                    label="Tool Name",
+                    info="Choose the tool to run",
+                )
+                model_name = gr.Dropdown(
+                    [
+                        "gpt-3.5-turbo-0125",
+                        "gpt-4-0125-preview",
+                        "claude-3-haiku-20240307",
+                        "claude-3-sonnet-20240229",
+                        "claude-3-opus-20240229",
+                        "databricks/dbrx-instruct:nitro",
+                        "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
+                        # "cohere/command-r-plus",
+                    ],
+                    label="Model Name",
+                    info="Choose the model to use",
+                )
             with gr.Row():
+                openai_api_key = gr.Textbox(
+                    label="OpenAI API Key",
+                    placeholder="Enter your OpenAI API key here",
+                    type="password",
+                )
+                anthropic_api_key = gr.Textbox(
+                    label="Anthropic API Key",
+                    placeholder="Enter your Anthropic API key here",
+                    type="password",
+                )
+                openrouter_api_key = gr.Textbox(
+                    label="OpenRouter API Key",
+                    placeholder="Enter your OpenRouter API key here",
+                    type="password",
+                )
             with gr.Row():
                 num_questions = gr.Slider(
+                    minimum=1,
+                    maximum=340,
+                    value=10,
+                    label="Number of questions to run the benchmark on",
+                )
             with gr.Row():
                 run_button = gr.Button("Run Benchmark")
             with gr.Row():
             with gr.Row():
                 with gr.Accordion("Summary", open=False):
                     summary = gr.Dataframe()
+            run_button.click(
+                run_benchmark_gradio,
+                inputs=[
+                    tool_name,
+                    model_name,
+                    num_questions,
+                    openai_api_key,
+                    anthropic_api_key,
+                    openrouter_api_key,
+                ],
+                outputs=[result, summary],
+            )
+demo.queue(default_concurrency_limit=40).launch()

tabs/dashboard.py CHANGED Viewed

@@ -3,8 +3,10 @@ import pandas as pd
 csv_file_path = "formatted_data.csv"
 def return_df():
     # Reading the CSV file
     df = pd.read_csv(csv_file_path)
     # all floats to be rounded to 2 decimal places
@@ -12,4 +14,4 @@ def return_df():
     return df
-df = return_df()

 csv_file_path = "formatted_data.csv"
 def return_df():
     # Reading the CSV file
+    print("Reading csv file with results")
     df = pd.read_csv(csv_file_path)
     # all floats to be rounded to 2 decimal places
     return df
+df = return_df()

tabs/run_benchmark.py CHANGED Viewed

@@ -2,8 +2,17 @@ import os
 from benchmark.run_benchmark import run_benchmark
-def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
     """Run the benchmark using the provided function and API key."""
     # Empty the results directory
     os.system("rm -rf results/*")
@@ -30,7 +39,10 @@ def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, ant
     else:
         kwargs["llm_provider"] = "openrouter"
-    if tool_name == "prediction-request-reasoning" or tool_name == "prediction-request-rag":
         if not openai_api_key:
             return f"Error: Tools that use RAG also require an OpenAI API Key"
@@ -39,12 +51,10 @@ def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, ant
     kwargs["provide_source_links"] = True
     print(f"Running benchmark")
     # Run the benchmark
     try:
         run_benchmark(kwargs=kwargs)
         return "completed"
     except Exception as e:
         return f"Error running benchmark: {e}"

 from benchmark.run_benchmark import run_benchmark
+def run_benchmark_main(
+    tool_name,
+    model_name,
+    num_questions,
+    openai_api_key,
+    anthropic_api_key,
+    openrouter_api_key,
+):
     """Run the benchmark using the provided function and API key."""
+    print("Running benchmark for the provided api keys")
     # Empty the results directory
     os.system("rm -rf results/*")
     else:
         kwargs["llm_provider"] = "openrouter"
+    if (
+        tool_name == "prediction-request-reasoning"
+        or tool_name == "prediction-request-rag"
+    ):
         if not openai_api_key:
             return f"Error: Tools that use RAG also require an OpenAI API Key"
     kwargs["provide_source_links"] = True
     print(f"Running benchmark")
     # Run the benchmark
     try:
         run_benchmark(kwargs=kwargs)
         return "completed"
     except Exception as e:
         return f"Error running benchmark: {e}"