import os

from benchmark.run_benchmark import run_benchmark


def run_benchmark_main(
    tool_name,
    model_name,
    num_questions,
    openai_api_key,
    anthropic_api_key,
    openrouter_api_key,
):
    """Run the benchmark for the given tool and model using the provided API keys."""
    print("Running benchmark for the provided API keys")

    # Clear results left over from a previous run (relies on a Unix shell).
    os.system("rm -rf results/*")

    # Assemble the keyword arguments forwarded to run_benchmark.
    kwargs = {}
    if not num_questions:
        num_questions = 10  # default number of benchmark questions
    kwargs["num_questions"] = num_questions
    kwargs["tools"] = [tool_name]
    if model_name:
        kwargs["model"] = [model_name]

    # Only forward the API keys that were actually supplied.
    kwargs["api_keys"] = {}
    if openai_api_key:
        kwargs["api_keys"]["openai"] = openai_api_key
    if anthropic_api_key:
        kwargs["api_keys"]["anthropic"] = anthropic_api_key
    if openrouter_api_key:
        kwargs["api_keys"]["openrouter"] = openrouter_api_key

    # Infer the LLM provider from the model name; default to OpenRouter.
    # Guard against a missing model_name, which would otherwise raise a TypeError.
    if model_name and "gpt" in model_name:
        kwargs["llm_provider"] = "openai"
    elif model_name and "claude" in model_name:
        kwargs["llm_provider"] = "anthropic"
    else:
        kwargs["llm_provider"] = "openrouter"

    # RAG-based tools call OpenAI internally, so an OpenAI key is required
    # even when another provider serves the main model.
    if tool_name in ("prediction-request-reasoning", "prediction-request-rag"):
        if not openai_api_key:
            return "Error: Tools that use RAG also require an OpenAI API Key"

    kwargs["num_urls"] = 3
    kwargs["num_words"] = 300
    kwargs["provide_source_links"] = True

    print("Running benchmark")

    try:
        run_benchmark(kwargs=kwargs)
        return "completed"
    except Exception as e:
        return f"Error running benchmark: {e}"
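

# Example invocation: a minimal sketch, not part of the original module.
# The tool and model names below are illustrative assumptions; substitute
# whatever your benchmark setup actually supports, and provide the API key
# via an environment variable rather than hard-coding it.
if __name__ == "__main__":
    status = run_benchmark_main(
        tool_name="prediction-online",  # hypothetical tool name
        model_name="gpt-4",  # "gpt" in the name routes llm_provider to "openai"
        num_questions=5,
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        anthropic_api_key=None,
        openrouter_api_key=None,
    )
    print(status)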