import gradio as gr
import time
from apscheduler.schedulers.background import BackgroundScheduler
import threading
import globals
from utils.io import save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
from utils.jobs import run_single_job, run_multiple_jobs, launch_jobs, update_job_statuses, relaunch_failed_jobs
from typing import List, Optional


def status_monitor() -> None:
    """Background thread to monitor job statuses."""
    while True:
        update_job_statuses()
        time.sleep(240)  # Check every 4 minutes


def daily_checkpoint() -> None:
    """Daily checkpoint - save current state."""
    print("Daily checkpoint - saving current state")
    save_results()


# Create Gradio interface
def create_app() -> gr.Blocks:
    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

            # Manual job launch section
            with gr.Row():
                with gr.Column(scale=2):
                    model_input = gr.Textbox(
                        label="Model",
                        placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct",
                        info="Enter HuggingFace model ID"
                    )
                with gr.Column(scale=1):
                    provider_input = gr.Textbox(
                        label="Provider",
                        placeholder="e.g., together-ai",
                        info="Enter inference provider name"
                    )
                with gr.Column(scale=1):
                    launch_single_btn = gr.Button("Launch Job", variant="primary")

            # Batch action buttons
            with gr.Row():
                launch_btn = gr.Button("Launch All Jobs", variant="secondary", scale=2)
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop", scale=1)
                refresh_btn = gr.Button("πŸ”„ Refresh", variant="secondary", scale=1)

            output = gr.Textbox(label="Status", interactive=False)

            # Summary statistics
            summary_stats = gr.Markdown(value=get_summary_stats())

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
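                        # Columns 0-10 are display-only; the last column (index 11) is the
                        # clickable "rerun" cell handled in handle_table_select below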
                        static_columns=list(range(11)),
                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                        elem_id="results_table"
                    )


            # Event handlers
            def launch_single_and_update(model: str, provider: str):
                """Launch multiple jobs for a model-provider combination and return updated table and stats."""
                if not model or not provider:
                    return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()

                job_ids = run_multiple_jobs(model, provider, globals.TASKS)
                if not job_ids:
                    return "❌ Failed to launch jobs (may already be running)", get_results_table(), get_summary_stats()

                save_results()
                return f"βœ… Launched {len(job_ids)} jobs for {model} on {provider}", get_results_table(), get_summary_stats()

            launch_single_btn.click(
                fn=launch_single_and_update,
                inputs=[model_input, provider_input],
                outputs=[output, results_table, summary_stats]
            )

            def launch_and_update():
                """Launch jobs and return updated table and stats."""
                result = launch_jobs()
                return result, get_results_table(), get_summary_stats()

            def relaunch_and_update():
                """Relaunch failed jobs and return updated table and stats."""
                result = relaunch_failed_jobs()
                return result, get_results_table(), get_summary_stats()

            launch_btn.click(
                fn=launch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            relaunch_failed_btn.click(
                fn=relaunch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            def refresh_display():
                """Refresh the table and stats display."""
                return get_results_table(), get_summary_stats()

            refresh_btn.click(
                fn=refresh_display,
                outputs=[results_table, summary_stats]
            )

            # Handle dataframe cell selection for relaunch
            def handle_table_select(evt: gr.SelectData):
                """Handle when a cell in the results table is clicked."""
                print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")

                # If we selected a "rerun" cell, we relaunch a job
                if evt.index[1] == 11:
                    # Get the full row data from the dataframe
                    df = get_results_table()
                    row_data = df.data.iloc[evt.index[0]]

                    model = row_data['Model']
                    provider = row_data['Provider']
                    print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")

                    run_multiple_jobs(model, provider, globals.TASKS)
                    # Save after relaunch
                    save_results()

                # Then update the table and stats
                return get_results_table(), get_summary_stats()

            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

            # Auto-refresh table and stats every 30 seconds
            def auto_refresh():
                """Auto-refresh table and summary stats."""
                return get_results_table(), get_summary_stats()

            # Create a timer for auto-refresh
            timer = gr.Timer(value=30, active=True)
            timer.tick(
                fn=auto_refresh,
                inputs=[],
                outputs=[results_table, summary_stats]
            )
        with gr.Tab("About"):
            gr.Markdown("""
In this demo, we run 10 samples each of 3 evaluations: ifeval (instruction following), gsm_plus (grade-school math problems, less contaminated than gsm8k), and the diamond subset of gpqa (knowledge), using `lighteval`, `inference-providers` and `jobs`.

The "status" column indicates whether the evaluation failed completely (usually because of the provider was down or because we were rate limited).

To run any of these locally, you can use the following:
```python
from huggingface_hub import run_job, inspect_job, whoami
job = run_job(
    image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
    command=[
        "lighteval", "endpoint", "inference-providers", 
        "model_name=MODEL,provider=PROVIDER", 
        "extended|ifeval|0,lighteval|gpqa:diamond|0", 
        "--push-to-hub", "--save-details", 
        "--results-org", "YOURORG"
    ],
    namespace="huggingface",
    secrets={"HF_TOKEN": YOURTOKEN},
    token=YOURTOKEN
)
```
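
After launching, you can check on the job from the same client. A minimal sketch using `inspect_job` (imported above), assuming the `JobInfo` returned by `run_job` exposes an `id` attribute:
```python
# Hypothetical follow-up: poll the job we just launched (attribute names may
# differ slightly depending on your huggingface_hub version)
info = inspect_job(job_id=job.id, token=YOURTOKEN)
print(info.status)
```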
""")

    return demo


if __name__ == "__main__":
    # Load previous results
    load_results()
    print("Starting Inference Provider Testing Dashboard")

    # Start status monitor thread
    monitor_thread = threading.Thread(target=status_monitor, daemon=True)
    monitor_thread.start()
    print("Job status monitor started")

    # Start APScheduler for daily checkpoint
    scheduler = BackgroundScheduler()
    scheduler.add_job(daily_checkpoint, 'cron', hour=0, minute=0)  # Run at midnight
    scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    # Create and launch the Gradio interface
    demo = create_app()
    demo.launch(server_name="0.0.0.0", server_port=7860)