cyberosa committed
Commit c6ae410 · 1 Parent(s): 634798b

activating run benchmark again. Using poetry instead of pip

Files changed (3):
  1. app.py +78 -78
  2. start.py +16 -15
  3. tabs/run_benchmark.py +2 -2
app.py CHANGED
@@ -13,7 +13,7 @@ from tabs.faq import (
 from tabs.howto_benchmark import how_to_run
 
 # disabling temporarily
-# from tabs.run_benchmark import run_benchmark_main
+from tabs.run_benchmark import run_benchmark_main
 
 demo = gr.Blocks()
 
@@ -111,83 +111,83 @@ with demo:
         gr.Markdown(how_to_run)
 
     # fourth tab - run the benchmark
-    # with gr.TabItem("🔥 Run the Benchmark"):
-    #     with gr.Row():
-    #         tool_name = gr.Dropdown(
-    #             [
-    #                 "prediction-offline",
-    #                 "prediction-online",
-    #                 # "prediction-online-summarized-info",
-    #                 # "prediction-offline-sme",
-    #                 # "prediction-online-sme",
-    #                 "prediction-request-rag",
-    #                 "prediction-request-reasoning",
-    #                 # "prediction-url-cot-claude",
-    #                 # "prediction-request-rag-cohere",
-    #                 # "prediction-with-research-conservative",
-    #                 # "prediction-with-research-bold",
-    #             ],
-    #             label="Tool Name",
-    #             info="Choose the tool to run",
-    #         )
-    #         model_name = gr.Dropdown(
-    #             [
-    #                 "gpt-3.5-turbo-0125",
-    #                 "gpt-4-0125-preview",
-    #                 "claude-3-haiku-20240307",
-    #                 "claude-3-sonnet-20240229",
-    #                 "claude-3-opus-20240229",
-    #                 "databricks/dbrx-instruct:nitro",
-    #                 "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
-    #                 # "cohere/command-r-plus",
-    #             ],
-    #             label="Model Name",
-    #             info="Choose the model to use",
-    #         )
-    #     with gr.Row():
-    #         openai_api_key = gr.Textbox(
-    #             label="OpenAI API Key",
-    #             placeholder="Enter your OpenAI API key here",
-    #             type="password",
-    #         )
-    #         anthropic_api_key = gr.Textbox(
-    #             label="Anthropic API Key",
-    #             placeholder="Enter your Anthropic API key here",
-    #             type="password",
-    #         )
-    #         openrouter_api_key = gr.Textbox(
-    #             label="OpenRouter API Key",
-    #             placeholder="Enter your OpenRouter API key here",
-    #             type="password",
-    #         )
-    #     with gr.Row():
-    #         num_questions = gr.Slider(
-    #             minimum=1,
-    #             maximum=340,
-    #             value=10,
-    #             label="Number of questions to run the benchmark on",
-    #         )
-    #     with gr.Row():
-    #         run_button = gr.Button("Run Benchmark")
-    #     with gr.Row():
-    #         with gr.Accordion("Results", open=True):
-    #             result = gr.Dataframe()
-    #     with gr.Row():
-    #         with gr.Accordion("Summary", open=False):
-    #             summary = gr.Dataframe()
-
-    #     run_button.click(
-    #         run_benchmark_gradio,
-    #         inputs=[
-    #             tool_name,
-    #             model_name,
-    #             num_questions,
-    #             openai_api_key,
-    #             anthropic_api_key,
-    #             openrouter_api_key,
-    #         ],
-    #         outputs=[result, summary],
-    #     )
+    with gr.TabItem("🔥 Run the Benchmark"):
+        with gr.Row():
+            tool_name = gr.Dropdown(
+                [
+                    "prediction-offline",
+                    "prediction-online",
+                    # "prediction-online-summarized-info",
+                    # "prediction-offline-sme",
+                    # "prediction-online-sme",
+                    "prediction-request-rag",
+                    "prediction-request-reasoning",
+                    # "prediction-url-cot-claude",
+                    # "prediction-request-rag-cohere",
+                    # "prediction-with-research-conservative",
+                    # "prediction-with-research-bold",
+                ],
+                label="Tool Name",
+                info="Choose the tool to run",
+            )
+            model_name = gr.Dropdown(
+                [
+                    "gpt-3.5-turbo-0125",
+                    "gpt-4-0125-preview",
+                    "claude-3-haiku-20240307",
+                    "claude-3-sonnet-20240229",
+                    "claude-3-opus-20240229",
+                    "databricks/dbrx-instruct:nitro",
+                    "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
+                    # "cohere/command-r-plus",
+                ],
+                label="Model Name",
+                info="Choose the model to use",
+            )
+        with gr.Row():
+            openai_api_key = gr.Textbox(
+                label="OpenAI API Key",
+                placeholder="Enter your OpenAI API key here",
+                type="password",
+            )
+            anthropic_api_key = gr.Textbox(
+                label="Anthropic API Key",
+                placeholder="Enter your Anthropic API key here",
+                type="password",
+            )
+            openrouter_api_key = gr.Textbox(
+                label="OpenRouter API Key",
+                placeholder="Enter your OpenRouter API key here",
+                type="password",
+            )
+        with gr.Row():
+            num_questions = gr.Slider(
+                minimum=1,
+                maximum=340,
+                value=10,
+                label="Number of questions to run the benchmark on",
+            )
+        with gr.Row():
+            run_button = gr.Button("Run Benchmark")
+        with gr.Row():
+            with gr.Accordion("Results", open=True):
+                result = gr.Dataframe()
+        with gr.Row():
+            with gr.Accordion("Summary", open=False):
+                summary = gr.Dataframe()
+
+        run_button.click(
+            run_benchmark_gradio,
+            inputs=[
+                tool_name,
+                model_name,
+                num_questions,
+                openai_api_key,
+                anthropic_api_key,
+                openrouter_api_key,
+            ],
+            outputs=[result, summary],
+        )
 
 
 demo.queue(default_concurrency_limit=40).launch()
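The click wiring above passes the six form inputs to a callable named run_benchmark_gradio and expects two dataframes back (result and summary). That callable's definition is not part of this commit; below is a minimal sketch of what such an adapter could look like, assuming it delegates to run_benchmark_main from tabs/run_benchmark.py. The names, argument order, and return handling here are assumptions for illustration, not the space's actual code.

import pandas as pd

from tabs.run_benchmark import run_benchmark_main


def run_benchmark_gradio(
    tool_name: str,
    model_name: str,
    num_questions: int,
    openai_api_key: str,
    anthropic_api_key: str,
    openrouter_api_key: str,
):
    """Hypothetical adapter between the Gradio click event and the benchmark runner.

    Returns two dataframes matching outputs=[result, summary] in the click wiring.
    """
    # Assumption: run_benchmark_main accepts the same six values positionally.
    status = run_benchmark_main(
        tool_name,
        model_name,
        num_questions,
        openai_api_key,
        anthropic_api_key,
        openrouter_api_key,
    )
    if status != "completed":
        # Surface the error message in the results table instead of failing silently.
        return pd.DataFrame({"error": [status]}), pd.DataFrame()
    # Placeholder: a real adapter would load the result/summary files the benchmark writes.
    return pd.DataFrame(), pd.DataFrame()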
start.py CHANGED
@@ -45,25 +45,26 @@ def start():
     """Start commands."""
     print("Starting commands...")
     base_dir = os.getcwd()
-    # olas_dir = os.path.join(base_dir, "olas-predict-benchmark")
-    # mech_dir = os.path.join(olas_dir, "benchmark", "mech")
+    olas_dir = os.path.join(base_dir, "olas-predict-benchmark")
+    benchmark_dir = os.path.join(olas_dir, "benchmark")
+    mech_dir = os.path.join(olas_dir, "benchmark", "mech")
 
     commands = [
         ("git submodule init", base_dir),
         ("git submodule update --init --recursive", base_dir),
         ("git submodule update --remote --recursive", base_dir),
-        # (
-        #     'git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"',
-        #     olas_dir,
-        # ),
-        # ("git remote update", olas_dir),
-        # ("git fetch --all", olas_dir),
-        # ("git checkout main", olas_dir),
-        # ("git pull origin main", olas_dir),
-        # ("git checkout main", mech_dir),
-        # ("git pull origin main", mech_dir),
-        # ("pip install -e .", os.path.join(olas_dir, "benchmark")),
-        # ("pip install -e .", mech_dir),
+        (
+            'git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"',
+            olas_dir,
+        ),
+        ("git remote update", olas_dir),
+        ("git fetch --all", olas_dir),
+        ("git checkout main", olas_dir),
+        ("git pull origin main", olas_dir),
+        ("git checkout main", mech_dir),
+        ("git pull origin main", mech_dir),
+        ("poetry install", benchmark_dir),
+        ("pip install -e .", mech_dir),
         ("pip install lxml[html_clean]", base_dir),
         ("pip install --upgrade huggingface_hub", base_dir),
     ]
@@ -72,7 +73,7 @@ def start():
         run_command(command, cwd=cwd)
 
     # add benchmark to the path
-    # sys.path.append(os.path.join(olas_dir, "benchmark"))
+    sys.path.append(os.path.join(olas_dir, "benchmark"))
 
     # Download the dataset
     download_dataset()
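Each (command, cwd) pair in the list above is executed by run_command, whose definition sits outside this diff. A minimal sketch of such a helper, assuming it simply shells out through subprocess and fails loudly on a non-zero exit code; the actual implementation in start.py may differ.

import subprocess


def run_command(command: str, cwd: str) -> None:
    """Hypothetical sketch: run a shell command in the given working directory.

    Illustrates the (command, cwd) contract used by the commands list above;
    check=True raises CalledProcessError if the command exits non-zero.
    """
    print(f"Running: {command} (cwd={cwd})")
    subprocess.run(command, shell=True, cwd=cwd, check=True)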
tabs/run_benchmark.py CHANGED
@@ -1,6 +1,6 @@
 import os
 
-# from benchmark.run_benchmark import run_benchmark
+from benchmark.run_benchmark import run_benchmark
 
 
 def run_benchmark_main(
@@ -55,7 +55,7 @@ def run_benchmark_main(
 
     # Run the benchmark
     try:
-        # run_benchmark(kwargs=kwargs)
+        run_benchmark(kwargs=kwargs)
         return "completed"
     except Exception as e:
         return f"Error running benchmark: {e}"