Spaces:

valory
/

olas-prediction-leaderboard

Running

App Files Files Community

cyberosa committed on Jul 2, 2024

Commit

16d0da9

1 Parent(s): e3f2881

disabling the run benchmark feature to fix the leaderboard

Browse files

Files changed (4) hide show

.gitmodules +0 -3
app.py +127 -125
olas-predict-benchmark +0 -1
tabs/faq.py +2 -2

.gitmodules DELETED Viewed

@@ -1,3 +0,0 @@
-[submodule "olas-predict-benchmark"]
-	path = olas-predict-benchmark
-	url = https://github.com/valory-xyz/olas-predict-benchmark.git

app.py CHANGED Viewed

@@ -11,69 +11,71 @@ from tabs.faq import (
     about_the_tools,
 )
 from tabs.howto_benchmark import how_to_run
-from tabs.run_benchmark import run_benchmark_main
 demo = gr.Blocks()
-def run_benchmark_gradio(
-    tool_name,
-    model_name,
-    num_questions,
-    openai_api_key,
-    anthropic_api_key,
-    openrouter_api_key,
-):
-    """Run the benchmark using inputs."""
-    if tool_name is None:
-        return "Please enter the name of your tool."
-    if (
-        openai_api_key is None
-        and anthropic_api_key is None
-        and openrouter_api_key is None
-    ):
-        return "Please enter either OpenAI or Anthropic or OpenRouter API key."
-    result = run_benchmark_main(
-        tool_name,
-        model_name,
-        num_questions,
-        openai_api_key,
-        anthropic_api_key,
-        openrouter_api_key,
-    )
-    if result == "completed":
-        # get the results file in the results directory
-        fns = glob("results/*.csv")
-        print(f"Number of files in results directory: {len(fns)}")
-        # convert to Path
-        files = [Path(file) for file in fns]
-        # get results and summary files
-        results_files = [file for file in files if "results" in file.name]
-        # the other file is the summary file
-        summary_files = [file for file in files if "summary" in file.name]
-        print(results_files, summary_files)
-        # get the path with results
-        results_df = pd.read_csv(results_files[0])
-        summary_df = pd.read_csv(summary_files[0])
-        # make sure all df float values are rounded to 4 decimal places
-        results_df = results_df.round(4)
-        summary_df = summary_df.round(4)
-        return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
-    return gr.Textbox(
-        label="Benchmark Result", value=result, interactive=False
-    ), gr.Textbox(label="Summary", value="")
 with demo:
@@ -110,83 +112,83 @@ with demo:
             gr.Markdown(how_to_run)
         # fourth tab - run the benchmark
-        with gr.TabItem("🔥 Run the Benchmark"):
-            with gr.Row():
-                tool_name = gr.Dropdown(
-                    [
-                        "prediction-offline",
-                        "prediction-online",
-                        # "prediction-online-summarized-info",
-                        # "prediction-offline-sme",
-                        # "prediction-online-sme",
-                        "prediction-request-rag",
-                        "prediction-request-reasoning",
-                        # "prediction-url-cot-claude",
-                        # "prediction-request-rag-cohere",
-                        # "prediction-with-research-conservative",
-                        # "prediction-with-research-bold",
-                    ],
-                    label="Tool Name",
-                    info="Choose the tool to run",
-                )
-                model_name = gr.Dropdown(
-                    [
-                        "gpt-3.5-turbo-0125",
-                        "gpt-4-0125-preview",
-                        "claude-3-haiku-20240307",
-                        "claude-3-sonnet-20240229",
-                        "claude-3-opus-20240229",
-                        "databricks/dbrx-instruct:nitro",
-                        "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
-                        # "cohere/command-r-plus",
-                    ],
-                    label="Model Name",
-                    info="Choose the model to use",
-                )
-            with gr.Row():
-                openai_api_key = gr.Textbox(
-                    label="OpenAI API Key",
-                    placeholder="Enter your OpenAI API key here",
-                    type="password",
-                )
-                anthropic_api_key = gr.Textbox(
-                    label="Anthropic API Key",
-                    placeholder="Enter your Anthropic API key here",
-                    type="password",
-                )
-                openrouter_api_key = gr.Textbox(
-                    label="OpenRouter API Key",
-                    placeholder="Enter your OpenRouter API key here",
-                    type="password",
-                )
-            with gr.Row():
-                num_questions = gr.Slider(
-                    minimum=1,
-                    maximum=340,
-                    value=10,
-                    label="Number of questions to run the benchmark on",
-                )
-            with gr.Row():
-                run_button = gr.Button("Run Benchmark")
-            with gr.Row():
-                with gr.Accordion("Results", open=True):
-                    result = gr.Dataframe()
-            with gr.Row():
-                with gr.Accordion("Summary", open=False):
-                    summary = gr.Dataframe()
-            run_button.click(
-                run_benchmark_gradio,
-                inputs=[
-                    tool_name,
-                    model_name,
-                    num_questions,
-                    openai_api_key,
-                    anthropic_api_key,
-                    openrouter_api_key,
-                ],
-                outputs=[result, summary],
-            )
 demo.queue(default_concurrency_limit=40).launch()

     about_the_tools,
 )
 from tabs.howto_benchmark import how_to_run
+# Feature temporarily disabled until HF support helps us resolve the Space error
+# from tabs.run_benchmark import run_benchmark_main
 demo = gr.Blocks()
+# def run_benchmark_gradio(
+#     tool_name,
+#     model_name,
+#     num_questions,
+#     openai_api_key,
+#     anthropic_api_key,
+#     openrouter_api_key,
+# ):
+#     """Run the benchmark using inputs."""
+#     if tool_name is None:
+#         return "Please enter the name of your tool."
+#     if (
+#         openai_api_key is None
+#         and anthropic_api_key is None
+#         and openrouter_api_key is None
+#     ):
+#         return "Please enter either OpenAI or Anthropic or OpenRouter API key."
+#     result = run_benchmark_main(
+#         tool_name,
+#         model_name,
+#         num_questions,
+#         openai_api_key,
+#         anthropic_api_key,
+#         openrouter_api_key,
+#     )
+#     if result == "completed":
+#         # get the results file in the results directory
+#         fns = glob("results/*.csv")
+#         print(f"Number of files in results directory: {len(fns)}")
+#         # convert to Path
+#         files = [Path(file) for file in fns]
+#         # get results and summary files
+#         results_files = [file for file in files if "results" in file.name]
+#         # the other file is the summary file
+#         summary_files = [file for file in files if "summary" in file.name]
+#         print(results_files, summary_files)
+#         # get the path with results
+#         results_df = pd.read_csv(results_files[0])
+#         summary_df = pd.read_csv(summary_files[0])
+#         # make sure all df float values are rounded to 4 decimal places
+#         results_df = results_df.round(4)
+#         summary_df = summary_df.round(4)
+#         return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
+#     return gr.Textbox(
+#         label="Benchmark Result", value=result, interactive=False
+#     ), gr.Textbox(label="Summary", value="")
 with demo:
             gr.Markdown(how_to_run)
         # fourth tab - run the benchmark
+        # with gr.TabItem("🔥 Run the Benchmark"):
+        #     with gr.Row():
+        #         tool_name = gr.Dropdown(
+        #             [
+        #                 "prediction-offline",
+        #                 "prediction-online",
+        #                 # "prediction-online-summarized-info",
+        #                 # "prediction-offline-sme",
+        #                 # "prediction-online-sme",
+        #                 "prediction-request-rag",
+        #                 "prediction-request-reasoning",
+        #                 # "prediction-url-cot-claude",
+        #                 # "prediction-request-rag-cohere",
+        #                 # "prediction-with-research-conservative",
+        #                 # "prediction-with-research-bold",
+        #             ],
+        #             label="Tool Name",
+        #             info="Choose the tool to run",
+        #         )
+        #         model_name = gr.Dropdown(
+        #             [
+        #                 "gpt-3.5-turbo-0125",
+        #                 "gpt-4-0125-preview",
+        #                 "claude-3-haiku-20240307",
+        #                 "claude-3-sonnet-20240229",
+        #                 "claude-3-opus-20240229",
+        #                 "databricks/dbrx-instruct:nitro",
+        #                 "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
+        #                 # "cohere/command-r-plus",
+        #             ],
+        #             label="Model Name",
+        #             info="Choose the model to use",
+        #         )
+        #     with gr.Row():
+        #         openai_api_key = gr.Textbox(
+        #             label="OpenAI API Key",
+        #             placeholder="Enter your OpenAI API key here",
+        #             type="password",
+        #         )
+        #         anthropic_api_key = gr.Textbox(
+        #             label="Anthropic API Key",
+        #             placeholder="Enter your Anthropic API key here",
+        #             type="password",
+        #         )
+        #         openrouter_api_key = gr.Textbox(
+        #             label="OpenRouter API Key",
+        #             placeholder="Enter your OpenRouter API key here",
+        #             type="password",
+        #         )
+        #     with gr.Row():
+        #         num_questions = gr.Slider(
+        #             minimum=1,
+        #             maximum=340,
+        #             value=10,
+        #             label="Number of questions to run the benchmark on",
+        #         )
+        #     with gr.Row():
+        #         run_button = gr.Button("Run Benchmark")
+        #     with gr.Row():
+        #         with gr.Accordion("Results", open=True):
+        #             result = gr.Dataframe()
+        #     with gr.Row():
+        #         with gr.Accordion("Summary", open=False):
+        #             summary = gr.Dataframe()
+        #     run_button.click(
+        #         run_benchmark_gradio,
+        #         inputs=[
+        #             tool_name,
+        #             model_name,
+        #             num_questions,
+        #             openai_api_key,
+        #             anthropic_api_key,
+        #             openrouter_api_key,
+        #         ],
+        #         outputs=[result, summary],
+        #     )
 demo.queue(default_concurrency_limit=40).launch()

olas-predict-benchmark DELETED Viewed

	@@ -1 +0,0 @@
1	- Subproject commit cdb77050567ef441e231960cb2a26c20cf09cc30

tabs/faq.py CHANGED Viewed

@@ -10,7 +10,7 @@ However, we can learn about the relative strengths of the different approaches (
 This HF Space showcases the performance of the various models and workflows (called tools in the Olas ecosystem) for making predictions, in terms of accuracy and cost.\
-🤗 Pick a tool and run it on the benchmark using the "🔥 Run the Benchmark" page!
 """
 about_the_tools = """\
@@ -48,4 +48,4 @@ about_olas_predict = """\
 Olas is a network of autonomous services that can run complex logic in a decentralized manner, interacting with on- and off-chain data autonomously and continuously. For other use cases check out [olas.network](https://olas.network/).
 Since 'Olas' means 'waves' in Spanish, it is sometimes referred to as the 'ocean of services' 🌊.
 The project is co-created by [Valory](https://www.valory.xyz/). Valory aspires to enable communities, organizations and countries to co-own AI systems, beginning with decentralized autonomous agents.
-"""

 This HF Space showcases the performance of the various models and workflows (called tools in the Olas ecosystem) for making predictions, in terms of accuracy and cost.\
+🤗 Pick a tool and run it on the benchmark using the "🔥 Run the Benchmark" page! (This feature is temporarily disabled due to an error in HF Spaces)
 """
 about_the_tools = """\
 Olas is a network of autonomous services that can run complex logic in a decentralized manner, interacting with on- and off-chain data autonomously and continuously. For other use cases check out [olas.network](https://olas.network/).
 Since 'Olas' means 'waves' in Spanish, it is sometimes referred to as the 'ocean of services' 🌊.
 The project is co-created by [Valory](https://www.valory.xyz/). Valory aspires to enable communities, organizations and countries to co-own AI systems, beginning with decentralized autonomous agents.
+"""