Spaces:

valory
/

olas-prediction-leaderboard

Running

App Files Files Community

richardblythman committed on Apr 17, 2024

Commit

7d57619

1 Parent(s): 6bec1f5

add open source lms

Browse files

Files changed (2) hide show

app.py +16 -13
tabs/run_benchmark.py +4 -2

app.py CHANGED Viewed

@@ -17,14 +17,14 @@ from tabs.run_benchmark import run_benchmark_main
 demo = gr.Blocks()
-def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
     """Run the benchmark using inputs."""
     if tool_name is None:
         return "Please enter the name of your tool."
-    if openai_api_key is None and anthropic_api_key is None:
-        return "Please enter either OpenAI or Anthropic API key."
-    result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key)
     if result == 'completed':
         # get the results file in the results directory
         fns = glob('results/*.csv')
@@ -101,16 +101,15 @@ with demo:
                     [
                         "prediction-offline",
                         "prediction-online",
-                        "prediction-offline-sme",
-                        "prediction-online-sme",
-                        "claude-prediction-offline",
-                        "claude-prediction-online",
                         'prediction-request-rag',
-                        "prediction-with-research-conservative",
-                        "prediction-with-research-bold",
-                        "prediction-request-reasoning-claude",
-                        "prediction-request-rag-claude",
                         "prediction-url-cot-claude",
                     ], label="Tool Name", info="Choose the tool to run")
                 model_name = gr.Dropdown([
                     "gpt-3.5-turbo-0125",
@@ -118,10 +117,14 @@ with demo:
                     "claude-3-haiku-20240307",
                     "claude-3-sonnet-20240229",
                     "claude-3-opus-20240229",
                 ], label="Model Name", info="Choose the model to use")
             with gr.Row():
                 openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
                 anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
             with gr.Row():
                 num_questions = gr.Slider(
                                     minimum=1,
@@ -139,7 +142,7 @@ with demo:
                     summary = gr.Dataframe()
             run_button.click(run_benchmark_gradio,
-                            inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key],
                             outputs=[result, summary])
 demo.queue(default_concurrency_limit=40).launch()

 demo = gr.Blocks()
+def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
     """Run the benchmark using inputs."""
     if tool_name is None:
         return "Please enter the name of your tool."
+    if openai_api_key is None and anthropic_api_key is None and openrouter_api_key is None:
+        return "Please enter either OpenAI or Anthropic or OpenRouter API key."
+    result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key)
     if result == 'completed':
         # get the results file in the results directory
         fns = glob('results/*.csv')
                     [
                         "prediction-offline",
                         "prediction-online",
+                        # "prediction-online-summarized-info",
+                        # "prediction-offline-sme",
+                        # "prediction-online-sme",
                         'prediction-request-rag',
+                        'prediction-request-reasoning',
                         "prediction-url-cot-claude",
+                        # "prediction-request-rag-cohere",
+                        # "prediction-with-research-conservative",
+                        # "prediction-with-research-bold",
                     ], label="Tool Name", info="Choose the tool to run")
                 model_name = gr.Dropdown([
                     "gpt-3.5-turbo-0125",
                     "claude-3-haiku-20240307",
                     "claude-3-sonnet-20240229",
                     "claude-3-opus-20240229",
+                    "databricks/dbrx-instruct:nitro",
+                    "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
+                    # "cohere/command-r-plus",
                 ], label="Model Name", info="Choose the model to use")
             with gr.Row():
                 openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
                 anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
+                openrouter_api_key = gr.Textbox(label="OpenRouter API Key", placeholder="Enter your OpenRouter API key here", type="password")
             with gr.Row():
                 num_questions = gr.Slider(
                                     minimum=1,
                     summary = gr.Dataframe()
             run_button.click(run_benchmark_gradio,
+                            inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key],
                             outputs=[result, summary])
 demo.queue(default_concurrency_limit=40).launch()

tabs/run_benchmark.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 from benchmark.run_benchmark import run_benchmark
-def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
     """Run the benchmark using the provided function and API key."""
     # Empty the results directory
     os.system("rm -rf results/*")
@@ -20,7 +20,9 @@ def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, ant
         kwargs["api_keys"]["openai"] = openai_api_key
     if anthropic_api_key:
         kwargs["api_keys"]["anthropic"] = anthropic_api_key
     kwargs["num_urls"] = 3
     kwargs["num_words"] = 300
     kwargs["provide_source_links"] = True

 from benchmark.run_benchmark import run_benchmark
+def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
     """Run the benchmark using the provided function and API key."""
     # Empty the results directory
     os.system("rm -rf results/*")
         kwargs["api_keys"]["openai"] = openai_api_key
     if anthropic_api_key:
         kwargs["api_keys"]["anthropic"] = anthropic_api_key
+    if openrouter_api_key:
+        kwargs["api_keys"]["openrouter"] = openrouter_api_key
     kwargs["num_urls"] = 3
     kwargs["num_words"] = 300
     kwargs["provide_source_links"] = True