Spaces:
Sleeping
Sleeping
File size: 8,441 Bytes
7f5506e 06c0fa4 6e44082 7f5506e d7de3ad 31c57c2 d7de3ad 31c57c2 9252209 5312397 d7de3ad 9252209 d7de3ad 6e44082 d7de3ad 31c57c2 6e44082 31c57c2 6e44082 31c57c2 6e44082 31c57c2 80d548a 9252209 d7de3ad 9252209 d7de3ad 7f5506e d7de3ad 9252209 d7de3ad 7f5506e 5312397 d7de3ad 6e44082 d7de3ad 6e44082 d7de3ad 6e44082 2a8dc61 d7de3ad 9252209 d7de3ad 9252209 31c57c2 9252209 31c57c2 d7de3ad 7b35424 d7de3ad 5f4c2c0 d7de3ad 0b4b222 d7de3ad 7f5506e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import gradio as gr
import time
from apscheduler.schedulers.background import BackgroundScheduler
import threading
import globals
from utils.io import save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
from utils.jobs import run_single_job, run_multiple_jobs, launch_jobs, update_job_statuses, relaunch_failed_jobs
from typing import List, Optional
def status_monitor() -> None:
    """Background thread to monitor job statuses.

    Runs forever; intended to be started as a daemon thread so it dies
    with the main process.
    """
    while True:
        update_job_statuses()
        # Poll every 240 seconds (4 minutes). NOTE(review): a stale comment
        # here previously claimed "every 30 seconds" — 240 is what the code does.
        time.sleep(240)
def daily_checkpoint() -> None:
    """Persist the current state to disk; scheduled to run once per day."""
    print("Daily checkpoint - saving current state")
    save_results()
# Create Gradio interface
def create_app() -> gr.Blocks:
    """Build and return the Gradio dashboard.

    The "Main" tab lets users launch single or batch evaluation jobs,
    relaunch failures, and watch a live-updating results table; the
    "About" tab documents the evaluations being run.

    Returns:
        gr.Blocks: the assembled (not yet launched) Gradio app.
    """
    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

            # Manual job launch section
            with gr.Row():
                with gr.Column(scale=2):
                    model_input = gr.Textbox(
                        label="Model",
                        placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct",
                        info="Enter HuggingFace model ID"
                    )
                with gr.Column(scale=1):
                    provider_input = gr.Textbox(
                        label="Provider",
                        placeholder="e.g., together-ai",
                        info="Enter inference provider name"
                    )
                with gr.Column(scale=1):
                    launch_single_btn = gr.Button("Launch Job", variant="primary")

            # Batch action buttons
            with gr.Row():
                launch_btn = gr.Button("Launch All Jobs", variant="secondary", scale=2)
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop", scale=1)
                # BUG FIX: label was mojibake ("π Refresh") from encoding damage;
                # restored to the intended refresh emoji.
                refresh_btn = gr.Button("🔄 Refresh", variant="secondary", scale=1)

            output = gr.Textbox(label="Status", interactive=False)

            # Summary statistics
            summary_stats = gr.Markdown(value=get_summary_stats())

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
                        # Columns 0-10 are read-only; only the trailing "rerun"
                        # column (index 11) reacts to clicks (see handle_table_select).
                        static_columns=list(range(11)),
                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                        elem_id="results_table"
                    )

            # Event handlers
            def launch_single_and_update(model: str, provider: str):
                """Launch multiple jobs for a model-provider combination and return updated table and stats."""
                if not model or not provider:
                    return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()
                job_ids = run_multiple_jobs(model, provider, globals.TASKS)
                if not job_ids:
                    return "❌ Failed to launch jobs (may already be running)", get_results_table(), get_summary_stats()
                save_results()
                # BUG FIX: this return was previously split across two source lines,
                # leaving an unterminated f-string (a syntax error); rejoined and the
                # mojibake status glyph restored to the intended check mark.
                return f"✅ Launched {len(job_ids)} jobs for {model} on {provider}", get_results_table(), get_summary_stats()

            launch_single_btn.click(
                fn=launch_single_and_update,
                inputs=[model_input, provider_input],
                outputs=[output, results_table, summary_stats]
            )

            def launch_and_update():
                """Launch jobs and return updated table and stats."""
                result = launch_jobs()
                return result, get_results_table(), get_summary_stats()

            def relaunch_and_update():
                """Relaunch failed jobs and return updated table and stats."""
                result = relaunch_failed_jobs()
                return result, get_results_table(), get_summary_stats()

            launch_btn.click(
                fn=launch_and_update,
                outputs=[output, results_table, summary_stats]
            )
            relaunch_failed_btn.click(
                fn=relaunch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            def refresh_display():
                """Refresh the table and stats display."""
                return get_results_table(), get_summary_stats()

            refresh_btn.click(
                fn=refresh_display,
                outputs=[results_table, summary_stats]
            )

            # Handle dataframe cell selection for relaunch
            def handle_table_select(evt: gr.SelectData):
                """Handle when a cell in the results table is clicked."""
                print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
                # Only the "rerun" column (index 11) triggers a relaunch.
                if evt.index[1] == 11:
                    # Get the full row data from the dataframe.
                    # NOTE(review): assumes get_results_table() returns an object
                    # whose .data is a pandas DataFrame with 'Model'/'Provider'
                    # columns — confirm against utils.io.get_results_table.
                    df = get_results_table()
                    row_data = df.data.iloc[evt.index[0]]
                    model = row_data['Model']
                    provider = row_data['Provider']
                    print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")
                    run_multiple_jobs(model, provider, globals.TASKS)
                    # Save after relaunch
                    save_results()
                # Then update the table and stats
                return get_results_table(), get_summary_stats()

            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

            # Auto-refresh table and stats every 30 seconds
            def auto_refresh():
                """Auto-refresh table and summary stats."""
                return get_results_table(), get_summary_stats()

            # Create a timer for auto-refresh
            timer = gr.Timer(value=30, active=True)
            timer.tick(
                fn=auto_refresh,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

        with gr.Tab("About"):
            gr.Markdown("""
In this demo, we run 10 samples of 3 evaluations: ifeval (instruction following), gsm_plus (grade school math problems, less contaminated than gsm8k) and gpqa, diamond subset (knowledge), with `lighteval`, `inference-providers` and `jobs`.
The "status" column indicates whether the evaluation failed completely (usually because of the provider was down or because we were rate limited).
To run any of these locally, you can use the following
```python
from huggingface_hub import run_job, inspect_job, whoami
job = run_job(
    image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
    command=[
        "lighteval", "endpoint", "inference-providers",
        "model_name=MODEL,provider=PROVIDER",
        "extended|ifeval|0,lighteval|gpqa:diamond|0",
        "--push-to-hub", "--save-details",
        "--results-org", "YOURORG"
    ],
    namespace="huggingface",
    secrets={"HF_TOKEN": YOURTOKEN},
    token=YOURTOKEN
)
```
""")
    return demo
if __name__ == "__main__":
    # Restore any previously persisted results before serving requests.
    load_results()
    print("Starting Inference Provider Testing Dashboard")

    # Daemon thread that continuously polls job statuses in the background.
    status_thread = threading.Thread(target=status_monitor, daemon=True)
    status_thread.start()
    print("Job status monitor started")

    # Cron-style scheduler: persist state once per day at midnight.
    checkpoint_scheduler = BackgroundScheduler()
    checkpoint_scheduler.add_job(daily_checkpoint, 'cron', hour=0, minute=0)
    checkpoint_scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    # Build the UI and serve it on all interfaces at the Spaces default port.
    app = create_app()
    app.launch(server_name="0.0.0.0", server_port=7860)
|