"""Gradio front-end that starts LLM AutoEval runs on RunPod."""

import runpod
import gradio as gr

# GPU type identifiers offered in the "GPU" dropdown; the selected entry is
# forwarded unchanged as ``gpu_type_id`` to ``runpod.create_pod``.
GPU_LIST = [
    "NVIDIA A100 80GB PCIe",
    "NVIDIA A100-SXM4-80GB",
    "NVIDIA A30",
    "NVIDIA A40",
    "NVIDIA GeForce RTX 3070",
    "NVIDIA GeForce RTX 3080",
    "NVIDIA GeForce RTX 3080 Ti",
    "NVIDIA GeForce RTX 3090",
    "NVIDIA GeForce RTX 3090 Ti",
    "NVIDIA GeForce RTX 4070 Ti",
    "NVIDIA GeForce RTX 4080",
    "NVIDIA GeForce RTX 4090",
    "NVIDIA H100 80GB HBM3",
    "NVIDIA H100 PCIe",
    "NVIDIA L4",
    "NVIDIA L40",
    "NVIDIA RTX 4000 Ada Generation",
    "NVIDIA RTX 4000 SFF Ada Generation",
    "NVIDIA RTX 5000 Ada Generation",
    "NVIDIA RTX 6000 Ada Generation",
    "NVIDIA RTX A2000",
    "NVIDIA RTX A4000",
    "NVIDIA RTX A4500",
    "NVIDIA RTX A5000",
    "NVIDIA RTX A6000",
    "Tesla V100-FHHL-16GB",
    "Tesla V100-PCIE-16GB",
    "Tesla V100-SXM2-16GB",
    "Tesla V100-SXM2-32GB",
]

# Header rendered at the top of the page through ``gr.HTML``.
TITLE = """

🧐 LLM AutoEval

💻 GitHub • 📝 Colab notebook

Automatically evaluate your LLMs using RunPod. If you don't have an account, please consider using my referral link.

Once a pod has started, you can safely close this tab. The results are then privately uploaded to GitHub Gist, and the pod is automatically destroyed.

"""


def autoeval(
    BENCHMARK: str,
    MODEL_ID: str,
    GPU: str,
    NUMBER_OF_GPUS: int,
    CONTAINER_DISK: int,
    CLOUD_TYPE: str,
    REPO: str,
    TRUST_REMOTE_CODE: bool,
    DEBUG: bool,
    GITHUB_API_TOKEN: str,
    RUNPOD_TOKEN: str,
) -> str:
    """Create a RunPod pod that evaluates ``MODEL_ID`` on ``BENCHMARK``.

    The parameters arrive positionally from the ``inputs`` list of the UI
    defined below, in the same order.

    Args:
        BENCHMARK: Benchmark suite name; also used (capitalized) in the pod name.
        MODEL_ID: Model identifier; the part after the last ``/`` is used in
            the pod name.
        GPU: GPU type identifier, passed as ``gpu_type_id``.
        NUMBER_OF_GPUS: Passed as ``gpu_count``.
        CONTAINER_DISK: Passed as ``container_disk_in_gb``.
        CLOUD_TYPE: Passed as ``cloud_type``.
        REPO: Forwarded to the pod as the ``REPO`` environment variable.
        TRUST_REMOTE_CODE: Forwarded as the ``TRUST_REMOTE_CODE`` variable.
        DEBUG: Forwarded as the ``DEBUG`` variable.
        GITHUB_API_TOKEN: Forwarded as the ``GITHUB_API_TOKEN`` variable.
        RUNPOD_TOKEN: API key assigned to ``runpod.api_key`` for the request.

    Returns:
        The status message ``"Evaluation started!"`` once ``create_pod``
        has returned.

    Raises:
        gr.Error: If the model ID or RunPod token is blank, or if
            ``runpod.create_pod`` raises.
    """
    # Pasted values frequently carry stray whitespace; normalise them first.
    model_id = (MODEL_ID or "").strip()
    runpod_token = (RUNPOD_TOKEN or "").strip()
    github_token = (GITHUB_API_TOKEN or "").strip()

    # Fail early with a readable message instead of attempting to start a pod
    # that cannot be authenticated or has nothing to evaluate.
    if not model_id:
        raise gr.Error("Please enter the ID of the model you want to evaluate.")
    if not runpod_token:
        raise gr.Error("Please enter your Runpod API token.")

    # NOTE(review): the key is stored on the ``runpod`` module itself, so it is
    # shared by every request handled by this process - confirm this is
    # acceptable if several users can submit at the same time.
    runpod.api_key = runpod_token

    try:
        runpod.create_pod(
            name=f"Eval {model_id.split('/')[-1]} on {BENCHMARK.capitalize()}",
            image_name="runpod/pytorch:2.0.1-py3.10-cuda11.8.0-devel-ubuntu22.04",
            gpu_type_id=GPU,
            cloud_type=CLOUD_TYPE,
            # Cast defensively in case a slider delivers a float such as 1.0.
            gpu_count=int(NUMBER_OF_GPUS),
            volume_in_gb=0,
            container_disk_in_gb=int(CONTAINER_DISK),
            template_id="au6nz6emhk",
            env={
                "BENCHMARK": BENCHMARK,
                "MODEL_ID": model_id,
                "REPO": REPO,
                "TRUST_REMOTE_CODE": TRUST_REMOTE_CODE,
                "DEBUG": DEBUG,
                "GITHUB_API_TOKEN": github_token,
            },
        )
    except Exception as err:
        # UI boundary: turn any failure from the RunPod client into a message
        # the user can read, keeping the original exception as the cause.
        raise gr.Error(f"Could not start the pod: {err}") from err

    return "Evaluation started!"


with gr.Blocks() as demo:
    gr.HTML(TITLE)

    # The order of this list must match the parameter order of ``autoeval``.
    inputs = [
        gr.Dropdown(
            ["nous", "openllm"],
            label="Benchmark",
            info="Select your benchmark suite",
            value="nous",
        ),
        gr.Textbox(
            label="Model",
            value="mlabonne/NeuralBeagle14-7B",
            info="ID of the model you want to evaluate",
            placeholder="mlabonne/NeuralBeagle14-7B",
        ),
        gr.Dropdown(
            GPU_LIST,
            label="GPU",
            value="NVIDIA GeForce RTX 3090",
            info="Select your GPU to run the evaluation",
        ),
        gr.Slider(
            minimum=1,
            maximum=8,
            value=1,
            step=1,
            label="Number of GPUs",
            info="Number of GPUs to use",
        ),
        gr.Slider(
            minimum=50,
            maximum=500,
            value=75,
            step=25,
            label="Container disk",
            info="Size of the container disk in GB",
        ),
        gr.Dropdown(
            ["COMMUNITY", "SECURE"],
            value="COMMUNITY",
            label="Cloud type",
            info="Select your cloud type",
        ),
        gr.Textbox(
            value="https://github.com/mlabonne/llm-autoeval.git",
            label="LLM AutoEval repo",
            info="Link to your LLM AutoEval repo",
        ),
        gr.Checkbox(
            label="Trust remote code",
            value=False,
            info="Required for some models like phi-2",
        ),
        gr.Checkbox(
            label="Debug",
            value=False,
            info="Don't kill the pod after evaluation if activated",
        ),
        # Tokens are secrets: mask them while they are being typed.
        gr.Textbox(
            value="",
            label="Github API Token",
            info="Your Github API token",
            placeholder="ghp_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
            type="password",
        ),
        gr.Textbox(
            value="",
            label="Runpod API Token",
            info="Your Runpod API token",
            placeholder="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
            type="password",
        ),
    ]

    btn = gr.Button("Evaluate!")
    outputs = gr.Textbox(label="Output", autofocus=True)
    gr.HTML("""

→ Find your pods: https://www.runpod.io/console/pods

""")

    btn.click(autoeval, inputs, outputs)

demo.launch()